/*
 * Copyright (C) 2003 David S. Miller <davem@redhat.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* The *no_round* functions have been added by James A. Morrison, 2003,2004.
   The vis code from libmpeg2 was adapted for libavcodec by James A. Morrison.
 */
25 | #include "config.h" | |
26 | ||
44f54ceb MN |
27 | #include <inttypes.h> |
28 | ||
245976da | 29 | #include "libavcodec/dsputil.h" |
33e11284 | 30 | #include "libavutil/mem.h" |
ad403802 | 31 | #include "dsputil_vis.h" |
44f54ceb MN |
32 | |
33 | #include "vis.h" | |
34 | ||
/* The trick used in some of this file is the formula from the MMX
 * motion comp code, which is:
 *
 * (x+y+1)>>1 == (x|y)-((x^y)>>1)
 *
 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
 * We avoid overflows by masking before we do the shift, and we
 * implement the shift by multiplying by 1/2 using mul8x16.  So in
 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
 * the value 0x80808080 is in f8):
 *
 * fxor      f0,  f2, f10
 * fand      f10, f4, f10
 * fmul8x16  f8,  f10, f10
 * fand      f10, f6, f10
 * for       f0,  f2, f12
 * fpsub16   f12, f10, f10
 */

44f54ceb MN |
55 | #define DUP4(x) {x, x, x, x} |
56 | #define DUP8(x) {x, x, x, x, x, x, x, x} | |
d343d598 MR |
57 | DECLARE_ALIGNED(8, static const int16_t, constants1)[] = DUP4 (1); |
58 | DECLARE_ALIGNED(8, static const int16_t, constants2)[] = DUP4 (2); | |
59 | DECLARE_ALIGNED(8, static const int16_t, constants3)[] = DUP4 (3); | |
60 | DECLARE_ALIGNED(8, static const int16_t, constants6)[] = DUP4 (6); | |
61 | DECLARE_ALIGNED(8, static const int8_t, constants_fe)[] = DUP8 (0xfe); | |
62 | DECLARE_ALIGNED(8, static const int8_t, constants_7f)[] = DUP8 (0x7f); | |
63 | DECLARE_ALIGNED(8, static const int8_t, constants128)[] = DUP8 (128); | |
64 | DECLARE_ALIGNED(8, static const int16_t, constants256_512)[] = | |
bb270c08 | 65 | {256, 512, 256, 512}; |
d343d598 | 66 | DECLARE_ALIGNED(8, static const int16_t, constants256_1024)[] = |
bb270c08 DB |
67 | {256, 1024, 256, 1024}; |
68 | ||
69 | #define REF_0 0 | |
70 | #define REF_0_1 1 | |
71 | #define REF_2 2 | |
72 | #define REF_2_1 3 | |
73 | #define REF_4 4 | |
74 | #define REF_4_1 5 | |
75 | #define REF_6 6 | |
76 | #define REF_6_1 7 | |
77 | #define REF_S0 8 | |
78 | #define REF_S0_1 9 | |
79 | #define REF_S2 10 | |
80 | #define REF_S2_1 11 | |
81 | #define REF_S4 12 | |
82 | #define REF_S4_1 13 | |
83 | #define REF_S6 14 | |
84 | #define REF_S6_1 15 | |
85 | #define DST_0 16 | |
86 | #define DST_1 17 | |
87 | #define DST_2 18 | |
88 | #define DST_3 19 | |
89 | #define CONST_1 20 | |
90 | #define CONST_2 20 | |
91 | #define CONST_3 20 | |
92 | #define CONST_6 20 | |
93 | #define MASK_fe 20 | |
94 | #define CONST_128 22 | |
95 | #define CONST_256 22 | |
96 | #define CONST_512 22 | |
97 | #define CONST_1024 22 | |
98 | #define TMP0 24 | |
99 | #define TMP1 25 | |
100 | #define TMP2 26 | |
101 | #define TMP3 27 | |
102 | #define TMP4 28 | |
103 | #define TMP5 29 | |
104 | #define ZERO 30 | |
105 | #define MASK_7f 30 | |
106 | ||
107 | #define TMP6 32 | |
108 | #define TMP8 34 | |
109 | #define TMP10 36 | |
110 | #define TMP12 38 | |
111 | #define TMP14 40 | |
112 | #define TMP16 42 | |
113 | #define TMP18 44 | |
114 | #define TMP20 46 | |
115 | #define TMP22 48 | |
116 | #define TMP24 50 | |
117 | #define TMP26 52 | |
118 | #define TMP28 54 | |
119 | #define TMP30 56 | |
120 | #define TMP32 58 | |
44f54ceb | 121 | |
86decad6 | 122 | static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * ref, |
bb270c08 | 123 | const int stride, int height) |
44f54ceb | 124 | { |
bb270c08 DB |
125 | ref = vis_alignaddr(ref); |
126 | do { /* 5 cycles */ | |
127 | vis_ld64(ref[0], TMP0); | |
44f54ceb | 128 | |
bb270c08 | 129 | vis_ld64_2(ref, 8, TMP2); |
44f54ceb | 130 | |
bb270c08 DB |
131 | vis_ld64_2(ref, 16, TMP4); |
132 | ref += stride; | |
44f54ceb | 133 | |
bb270c08 DB |
134 | vis_faligndata(TMP0, TMP2, REF_0); |
135 | vis_st64(REF_0, dest[0]); | |
44f54ceb | 136 | |
bb270c08 DB |
137 | vis_faligndata(TMP2, TMP4, REF_2); |
138 | vis_st64_2(REF_2, dest, 8); | |
139 | dest += stride; | |
140 | } while (--height); | |
44f54ceb MN |
141 | } |
142 | ||
86decad6 | 143 | static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * ref, |
bb270c08 | 144 | const int stride, int height) |
44f54ceb | 145 | { |
bb270c08 DB |
146 | ref = vis_alignaddr(ref); |
147 | do { /* 4 cycles */ | |
148 | vis_ld64(ref[0], TMP0); | |
44f54ceb | 149 | |
bb270c08 DB |
150 | vis_ld64(ref[8], TMP2); |
151 | ref += stride; | |
44f54ceb | 152 | |
bb270c08 | 153 | /* stall */ |
44f54ceb | 154 | |
bb270c08 DB |
155 | vis_faligndata(TMP0, TMP2, REF_0); |
156 | vis_st64(REF_0, dest[0]); | |
157 | dest += stride; | |
158 | } while (--height); | |
44f54ceb MN |
159 | } |
160 | ||
161 | ||
86decad6 | 162 | static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * ref, |
bb270c08 | 163 | const int stride, int height) |
44f54ceb | 164 | { |
bb270c08 | 165 | int stride_8 = stride + 8; |
44f54ceb | 166 | |
bb270c08 | 167 | ref = vis_alignaddr(ref); |
44f54ceb | 168 | |
bb270c08 | 169 | vis_ld64(ref[0], TMP0); |
44f54ceb | 170 | |
bb270c08 | 171 | vis_ld64(ref[8], TMP2); |
44f54ceb | 172 | |
bb270c08 | 173 | vis_ld64(ref[16], TMP4); |
44f54ceb | 174 | |
bb270c08 | 175 | vis_ld64(dest[0], DST_0); |
44f54ceb | 176 | |
bb270c08 | 177 | vis_ld64(dest[8], DST_2); |
44f54ceb | 178 | |
bb270c08 DB |
179 | vis_ld64(constants_fe[0], MASK_fe); |
180 | vis_faligndata(TMP0, TMP2, REF_0); | |
44f54ceb | 181 | |
bb270c08 DB |
182 | vis_ld64(constants_7f[0], MASK_7f); |
183 | vis_faligndata(TMP2, TMP4, REF_2); | |
44f54ceb | 184 | |
bb270c08 | 185 | vis_ld64(constants128[0], CONST_128); |
44f54ceb | 186 | |
bb270c08 DB |
187 | ref += stride; |
188 | height = (height >> 1) - 1; | |
44f54ceb | 189 | |
bb270c08 DB |
190 | do { /* 24 cycles */ |
191 | vis_ld64(ref[0], TMP0); | |
192 | vis_xor(DST_0, REF_0, TMP6); | |
44f54ceb | 193 | |
bb270c08 DB |
194 | vis_ld64_2(ref, 8, TMP2); |
195 | vis_and(TMP6, MASK_fe, TMP6); | |
44f54ceb | 196 | |
bb270c08 DB |
197 | vis_ld64_2(ref, 16, TMP4); |
198 | ref += stride; | |
199 | vis_mul8x16(CONST_128, TMP6, TMP6); | |
200 | vis_xor(DST_2, REF_2, TMP8); | |
44f54ceb | 201 | |
bb270c08 | 202 | vis_and(TMP8, MASK_fe, TMP8); |
44f54ceb | 203 | |
bb270c08 DB |
204 | vis_or(DST_0, REF_0, TMP10); |
205 | vis_ld64_2(dest, stride, DST_0); | |
206 | vis_mul8x16(CONST_128, TMP8, TMP8); | |
44f54ceb | 207 | |
bb270c08 DB |
208 | vis_or(DST_2, REF_2, TMP12); |
209 | vis_ld64_2(dest, stride_8, DST_2); | |
44f54ceb | 210 | |
bb270c08 DB |
211 | vis_ld64(ref[0], TMP14); |
212 | vis_and(TMP6, MASK_7f, TMP6); | |
44f54ceb | 213 | |
bb270c08 | 214 | vis_and(TMP8, MASK_7f, TMP8); |
44f54ceb | 215 | |
bb270c08 DB |
216 | vis_psub16(TMP10, TMP6, TMP6); |
217 | vis_st64(TMP6, dest[0]); | |
44f54ceb | 218 | |
bb270c08 DB |
219 | vis_psub16(TMP12, TMP8, TMP8); |
220 | vis_st64_2(TMP8, dest, 8); | |
44f54ceb | 221 | |
bb270c08 DB |
222 | dest += stride; |
223 | vis_ld64_2(ref, 8, TMP16); | |
224 | vis_faligndata(TMP0, TMP2, REF_0); | |
44f54ceb | 225 | |
bb270c08 DB |
226 | vis_ld64_2(ref, 16, TMP18); |
227 | vis_faligndata(TMP2, TMP4, REF_2); | |
228 | ref += stride; | |
44f54ceb | 229 | |
bb270c08 | 230 | vis_xor(DST_0, REF_0, TMP20); |
44f54ceb | 231 | |
bb270c08 | 232 | vis_and(TMP20, MASK_fe, TMP20); |
44f54ceb | 233 | |
bb270c08 DB |
234 | vis_xor(DST_2, REF_2, TMP22); |
235 | vis_mul8x16(CONST_128, TMP20, TMP20); | |
44f54ceb | 236 | |
bb270c08 | 237 | vis_and(TMP22, MASK_fe, TMP22); |
44f54ceb | 238 | |
bb270c08 DB |
239 | vis_or(DST_0, REF_0, TMP24); |
240 | vis_mul8x16(CONST_128, TMP22, TMP22); | |
44f54ceb | 241 | |
bb270c08 | 242 | vis_or(DST_2, REF_2, TMP26); |
44f54ceb | 243 | |
bb270c08 DB |
244 | vis_ld64_2(dest, stride, DST_0); |
245 | vis_faligndata(TMP14, TMP16, REF_0); | |
44f54ceb | 246 | |
bb270c08 DB |
247 | vis_ld64_2(dest, stride_8, DST_2); |
248 | vis_faligndata(TMP16, TMP18, REF_2); | |
44f54ceb | 249 | |
bb270c08 | 250 | vis_and(TMP20, MASK_7f, TMP20); |
44f54ceb | 251 | |
bb270c08 | 252 | vis_and(TMP22, MASK_7f, TMP22); |
44f54ceb | 253 | |
bb270c08 DB |
254 | vis_psub16(TMP24, TMP20, TMP20); |
255 | vis_st64(TMP20, dest[0]); | |
44f54ceb | 256 | |
bb270c08 DB |
257 | vis_psub16(TMP26, TMP22, TMP22); |
258 | vis_st64_2(TMP22, dest, 8); | |
259 | dest += stride; | |
260 | } while (--height); | |
44f54ceb | 261 | |
bb270c08 DB |
262 | vis_ld64(ref[0], TMP0); |
263 | vis_xor(DST_0, REF_0, TMP6); | |
44f54ceb | 264 | |
bb270c08 DB |
265 | vis_ld64_2(ref, 8, TMP2); |
266 | vis_and(TMP6, MASK_fe, TMP6); | |
44f54ceb | 267 | |
bb270c08 DB |
268 | vis_ld64_2(ref, 16, TMP4); |
269 | vis_mul8x16(CONST_128, TMP6, TMP6); | |
270 | vis_xor(DST_2, REF_2, TMP8); | |
44f54ceb | 271 | |
bb270c08 | 272 | vis_and(TMP8, MASK_fe, TMP8); |
44f54ceb | 273 | |
bb270c08 DB |
274 | vis_or(DST_0, REF_0, TMP10); |
275 | vis_ld64_2(dest, stride, DST_0); | |
276 | vis_mul8x16(CONST_128, TMP8, TMP8); | |
44f54ceb | 277 | |
bb270c08 DB |
278 | vis_or(DST_2, REF_2, TMP12); |
279 | vis_ld64_2(dest, stride_8, DST_2); | |
44f54ceb | 280 | |
bb270c08 DB |
281 | vis_ld64(ref[0], TMP14); |
282 | vis_and(TMP6, MASK_7f, TMP6); | |
44f54ceb | 283 | |
bb270c08 | 284 | vis_and(TMP8, MASK_7f, TMP8); |
44f54ceb | 285 | |
bb270c08 DB |
286 | vis_psub16(TMP10, TMP6, TMP6); |
287 | vis_st64(TMP6, dest[0]); | |
44f54ceb | 288 | |
bb270c08 DB |
289 | vis_psub16(TMP12, TMP8, TMP8); |
290 | vis_st64_2(TMP8, dest, 8); | |
44f54ceb | 291 | |
bb270c08 DB |
292 | dest += stride; |
293 | vis_faligndata(TMP0, TMP2, REF_0); | |
44f54ceb | 294 | |
bb270c08 | 295 | vis_faligndata(TMP2, TMP4, REF_2); |
44f54ceb | 296 | |
bb270c08 | 297 | vis_xor(DST_0, REF_0, TMP20); |
44f54ceb | 298 | |
bb270c08 | 299 | vis_and(TMP20, MASK_fe, TMP20); |
44f54ceb | 300 | |
bb270c08 DB |
301 | vis_xor(DST_2, REF_2, TMP22); |
302 | vis_mul8x16(CONST_128, TMP20, TMP20); | |
44f54ceb | 303 | |
bb270c08 | 304 | vis_and(TMP22, MASK_fe, TMP22); |
44f54ceb | 305 | |
bb270c08 DB |
306 | vis_or(DST_0, REF_0, TMP24); |
307 | vis_mul8x16(CONST_128, TMP22, TMP22); | |
44f54ceb | 308 | |
bb270c08 | 309 | vis_or(DST_2, REF_2, TMP26); |
44f54ceb | 310 | |
bb270c08 | 311 | vis_and(TMP20, MASK_7f, TMP20); |
44f54ceb | 312 | |
bb270c08 | 313 | vis_and(TMP22, MASK_7f, TMP22); |
44f54ceb | 314 | |
bb270c08 DB |
315 | vis_psub16(TMP24, TMP20, TMP20); |
316 | vis_st64(TMP20, dest[0]); | |
44f54ceb | 317 | |
bb270c08 DB |
318 | vis_psub16(TMP26, TMP22, TMP22); |
319 | vis_st64_2(TMP22, dest, 8); | |
44f54ceb MN |
320 | } |
321 | ||
86decad6 | 322 | static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * ref, |
bb270c08 | 323 | const int stride, int height) |
44f54ceb | 324 | { |
bb270c08 | 325 | ref = vis_alignaddr(ref); |
44f54ceb | 326 | |
bb270c08 | 327 | vis_ld64(ref[0], TMP0); |
44f54ceb | 328 | |
bb270c08 | 329 | vis_ld64(ref[8], TMP2); |
44f54ceb | 330 | |
bb270c08 | 331 | vis_ld64(dest[0], DST_0); |
44f54ceb | 332 | |
bb270c08 | 333 | vis_ld64(constants_fe[0], MASK_fe); |
44f54ceb | 334 | |
bb270c08 DB |
335 | vis_ld64(constants_7f[0], MASK_7f); |
336 | vis_faligndata(TMP0, TMP2, REF_0); | |
44f54ceb | 337 | |
bb270c08 | 338 | vis_ld64(constants128[0], CONST_128); |
44f54ceb | 339 | |
bb270c08 DB |
340 | ref += stride; |
341 | height = (height >> 1) - 1; | |
44f54ceb | 342 | |
bb270c08 DB |
343 | do { /* 12 cycles */ |
344 | vis_ld64(ref[0], TMP0); | |
345 | vis_xor(DST_0, REF_0, TMP4); | |
44f54ceb | 346 | |
bb270c08 DB |
347 | vis_ld64(ref[8], TMP2); |
348 | vis_and(TMP4, MASK_fe, TMP4); | |
44f54ceb | 349 | |
bb270c08 DB |
350 | vis_or(DST_0, REF_0, TMP6); |
351 | vis_ld64_2(dest, stride, DST_0); | |
352 | ref += stride; | |
353 | vis_mul8x16(CONST_128, TMP4, TMP4); | |
44f54ceb | 354 | |
bb270c08 DB |
355 | vis_ld64(ref[0], TMP12); |
356 | vis_faligndata(TMP0, TMP2, REF_0); | |
44f54ceb | 357 | |
bb270c08 DB |
358 | vis_ld64(ref[8], TMP2); |
359 | vis_xor(DST_0, REF_0, TMP0); | |
360 | ref += stride; | |
44f54ceb | 361 | |
bb270c08 | 362 | vis_and(TMP0, MASK_fe, TMP0); |
44f54ceb | 363 | |
bb270c08 | 364 | vis_and(TMP4, MASK_7f, TMP4); |
44f54ceb | 365 | |
bb270c08 DB |
366 | vis_psub16(TMP6, TMP4, TMP4); |
367 | vis_st64(TMP4, dest[0]); | |
368 | dest += stride; | |
369 | vis_mul8x16(CONST_128, TMP0, TMP0); | |
44f54ceb | 370 | |
bb270c08 DB |
371 | vis_or(DST_0, REF_0, TMP6); |
372 | vis_ld64_2(dest, stride, DST_0); | |
44f54ceb | 373 | |
bb270c08 | 374 | vis_faligndata(TMP12, TMP2, REF_0); |
44f54ceb | 375 | |
bb270c08 | 376 | vis_and(TMP0, MASK_7f, TMP0); |
44f54ceb | 377 | |
bb270c08 DB |
378 | vis_psub16(TMP6, TMP0, TMP4); |
379 | vis_st64(TMP4, dest[0]); | |
380 | dest += stride; | |
381 | } while (--height); | |
44f54ceb | 382 | |
bb270c08 DB |
383 | vis_ld64(ref[0], TMP0); |
384 | vis_xor(DST_0, REF_0, TMP4); | |
44f54ceb | 385 | |
bb270c08 DB |
386 | vis_ld64(ref[8], TMP2); |
387 | vis_and(TMP4, MASK_fe, TMP4); | |
44f54ceb | 388 | |
bb270c08 DB |
389 | vis_or(DST_0, REF_0, TMP6); |
390 | vis_ld64_2(dest, stride, DST_0); | |
391 | vis_mul8x16(CONST_128, TMP4, TMP4); | |
44f54ceb | 392 | |
bb270c08 | 393 | vis_faligndata(TMP0, TMP2, REF_0); |
44f54ceb | 394 | |
bb270c08 | 395 | vis_xor(DST_0, REF_0, TMP0); |
44f54ceb | 396 | |
bb270c08 | 397 | vis_and(TMP0, MASK_fe, TMP0); |
44f54ceb | 398 | |
bb270c08 | 399 | vis_and(TMP4, MASK_7f, TMP4); |
44f54ceb | 400 | |
bb270c08 DB |
401 | vis_psub16(TMP6, TMP4, TMP4); |
402 | vis_st64(TMP4, dest[0]); | |
403 | dest += stride; | |
404 | vis_mul8x16(CONST_128, TMP0, TMP0); | |
44f54ceb | 405 | |
bb270c08 | 406 | vis_or(DST_0, REF_0, TMP6); |
44f54ceb | 407 | |
bb270c08 | 408 | vis_and(TMP0, MASK_7f, TMP0); |
44f54ceb | 409 | |
bb270c08 DB |
410 | vis_psub16(TMP6, TMP0, TMP4); |
411 | vis_st64(TMP4, dest[0]); | |
44f54ceb MN |
412 | } |
413 | ||
86decad6 | 414 | static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * ref, |
bb270c08 | 415 | const int stride, int height) |
44f54ceb | 416 | { |
bb270c08 DB |
417 | unsigned long off = (unsigned long) ref & 0x7; |
418 | unsigned long off_plus_1 = off + 1; | |
44f54ceb | 419 | |
bb270c08 | 420 | ref = vis_alignaddr(ref); |
44f54ceb | 421 | |
bb270c08 | 422 | vis_ld64(ref[0], TMP0); |
44f54ceb | 423 | |
bb270c08 | 424 | vis_ld64_2(ref, 8, TMP2); |
44f54ceb | 425 | |
bb270c08 | 426 | vis_ld64_2(ref, 16, TMP4); |
44f54ceb | 427 | |
bb270c08 | 428 | vis_ld64(constants_fe[0], MASK_fe); |
44f54ceb | 429 | |
bb270c08 DB |
430 | vis_ld64(constants_7f[0], MASK_7f); |
431 | vis_faligndata(TMP0, TMP2, REF_0); | |
44f54ceb | 432 | |
bb270c08 DB |
433 | vis_ld64(constants128[0], CONST_128); |
434 | vis_faligndata(TMP2, TMP4, REF_4); | |
44f54ceb | 435 | |
bb270c08 DB |
436 | if (off != 0x7) { |
437 | vis_alignaddr_g0((void *)off_plus_1); | |
438 | vis_faligndata(TMP0, TMP2, REF_2); | |
439 | vis_faligndata(TMP2, TMP4, REF_6); | |
440 | } else { | |
441 | vis_src1(TMP2, REF_2); | |
442 | vis_src1(TMP4, REF_6); | |
443 | } | |
44f54ceb | 444 | |
bb270c08 DB |
445 | ref += stride; |
446 | height = (height >> 1) - 1; | |
44f54ceb | 447 | |
bb270c08 DB |
448 | do { /* 34 cycles */ |
449 | vis_ld64(ref[0], TMP0); | |
450 | vis_xor(REF_0, REF_2, TMP6); | |
44f54ceb | 451 | |
bb270c08 DB |
452 | vis_ld64_2(ref, 8, TMP2); |
453 | vis_xor(REF_4, REF_6, TMP8); | |
44f54ceb | 454 | |
bb270c08 DB |
455 | vis_ld64_2(ref, 16, TMP4); |
456 | vis_and(TMP6, MASK_fe, TMP6); | |
457 | ref += stride; | |
44f54ceb | 458 | |
bb270c08 DB |
459 | vis_ld64(ref[0], TMP14); |
460 | vis_mul8x16(CONST_128, TMP6, TMP6); | |
461 | vis_and(TMP8, MASK_fe, TMP8); | |
44f54ceb | 462 | |
bb270c08 DB |
463 | vis_ld64_2(ref, 8, TMP16); |
464 | vis_mul8x16(CONST_128, TMP8, TMP8); | |
465 | vis_or(REF_0, REF_2, TMP10); | |
44f54ceb | 466 | |
bb270c08 DB |
467 | vis_ld64_2(ref, 16, TMP18); |
468 | ref += stride; | |
469 | vis_or(REF_4, REF_6, TMP12); | |
44f54ceb | 470 | |
bb270c08 | 471 | vis_alignaddr_g0((void *)off); |
44f54ceb | 472 | |
bb270c08 | 473 | vis_faligndata(TMP0, TMP2, REF_0); |
44f54ceb | 474 | |
bb270c08 | 475 | vis_faligndata(TMP2, TMP4, REF_4); |
44f54ceb | 476 | |
bb270c08 DB |
477 | if (off != 0x7) { |
478 | vis_alignaddr_g0((void *)off_plus_1); | |
479 | vis_faligndata(TMP0, TMP2, REF_2); | |
480 | vis_faligndata(TMP2, TMP4, REF_6); | |
481 | } else { | |
482 | vis_src1(TMP2, REF_2); | |
483 | vis_src1(TMP4, REF_6); | |
484 | } | |
44f54ceb | 485 | |
bb270c08 | 486 | vis_and(TMP6, MASK_7f, TMP6); |
44f54ceb | 487 | |
bb270c08 | 488 | vis_and(TMP8, MASK_7f, TMP8); |
44f54ceb | 489 | |
bb270c08 DB |
490 | vis_psub16(TMP10, TMP6, TMP6); |
491 | vis_st64(TMP6, dest[0]); | |
44f54ceb | 492 | |
bb270c08 DB |
493 | vis_psub16(TMP12, TMP8, TMP8); |
494 | vis_st64_2(TMP8, dest, 8); | |
495 | dest += stride; | |
44f54ceb | 496 | |
bb270c08 | 497 | vis_xor(REF_0, REF_2, TMP6); |
44f54ceb | 498 | |
bb270c08 | 499 | vis_xor(REF_4, REF_6, TMP8); |
44f54ceb | 500 | |
bb270c08 | 501 | vis_and(TMP6, MASK_fe, TMP6); |
44f54ceb | 502 | |
bb270c08 DB |
503 | vis_mul8x16(CONST_128, TMP6, TMP6); |
504 | vis_and(TMP8, MASK_fe, TMP8); | |
44f54ceb | 505 | |
bb270c08 DB |
506 | vis_mul8x16(CONST_128, TMP8, TMP8); |
507 | vis_or(REF_0, REF_2, TMP10); | |
44f54ceb | 508 | |
bb270c08 | 509 | vis_or(REF_4, REF_6, TMP12); |
44f54ceb | 510 | |
bb270c08 | 511 | vis_alignaddr_g0((void *)off); |
44f54ceb | 512 | |
bb270c08 | 513 | vis_faligndata(TMP14, TMP16, REF_0); |
44f54ceb | 514 | |
bb270c08 | 515 | vis_faligndata(TMP16, TMP18, REF_4); |
44f54ceb | 516 | |
bb270c08 DB |
517 | if (off != 0x7) { |
518 | vis_alignaddr_g0((void *)off_plus_1); | |
519 | vis_faligndata(TMP14, TMP16, REF_2); | |
520 | vis_faligndata(TMP16, TMP18, REF_6); | |
521 | } else { | |
522 | vis_src1(TMP16, REF_2); | |
523 | vis_src1(TMP18, REF_6); | |
524 | } | |
44f54ceb | 525 | |
bb270c08 | 526 | vis_and(TMP6, MASK_7f, TMP6); |
44f54ceb | 527 | |
bb270c08 | 528 | vis_and(TMP8, MASK_7f, TMP8); |
44f54ceb | 529 | |
bb270c08 DB |
530 | vis_psub16(TMP10, TMP6, TMP6); |
531 | vis_st64(TMP6, dest[0]); | |
44f54ceb | 532 | |
bb270c08 DB |
533 | vis_psub16(TMP12, TMP8, TMP8); |
534 | vis_st64_2(TMP8, dest, 8); | |
535 | dest += stride; | |
536 | } while (--height); | |
44f54ceb | 537 | |
bb270c08 DB |
538 | vis_ld64(ref[0], TMP0); |
539 | vis_xor(REF_0, REF_2, TMP6); | |
44f54ceb | 540 | |
bb270c08 DB |
541 | vis_ld64_2(ref, 8, TMP2); |
542 | vis_xor(REF_4, REF_6, TMP8); | |
44f54ceb | 543 | |
bb270c08 DB |
544 | vis_ld64_2(ref, 16, TMP4); |
545 | vis_and(TMP6, MASK_fe, TMP6); | |
44f54ceb | 546 | |
bb270c08 DB |
547 | vis_mul8x16(CONST_128, TMP6, TMP6); |
548 | vis_and(TMP8, MASK_fe, TMP8); | |
44f54ceb | 549 | |
bb270c08 DB |
550 | vis_mul8x16(CONST_128, TMP8, TMP8); |
551 | vis_or(REF_0, REF_2, TMP10); | |
44f54ceb | 552 | |
bb270c08 | 553 | vis_or(REF_4, REF_6, TMP12); |
44f54ceb | 554 | |
bb270c08 | 555 | vis_alignaddr_g0((void *)off); |
44f54ceb | 556 | |
bb270c08 | 557 | vis_faligndata(TMP0, TMP2, REF_0); |
44f54ceb | 558 | |
bb270c08 | 559 | vis_faligndata(TMP2, TMP4, REF_4); |
44f54ceb | 560 | |
bb270c08 DB |
561 | if (off != 0x7) { |
562 | vis_alignaddr_g0((void *)off_plus_1); | |
563 | vis_faligndata(TMP0, TMP2, REF_2); | |
564 | vis_faligndata(TMP2, TMP4, REF_6); | |
565 | } else { | |
566 | vis_src1(TMP2, REF_2); | |
567 | vis_src1(TMP4, REF_6); | |
568 | } | |
44f54ceb | 569 | |
bb270c08 | 570 | vis_and(TMP6, MASK_7f, TMP6); |
44f54ceb | 571 | |
bb270c08 | 572 | vis_and(TMP8, MASK_7f, TMP8); |
44f54ceb | 573 | |
bb270c08 DB |
574 | vis_psub16(TMP10, TMP6, TMP6); |
575 | vis_st64(TMP6, dest[0]); | |
44f54ceb | 576 | |
bb270c08 DB |
577 | vis_psub16(TMP12, TMP8, TMP8); |
578 | vis_st64_2(TMP8, dest, 8); | |
579 | dest += stride; | |
44f54ceb | 580 | |
bb270c08 | 581 | vis_xor(REF_0, REF_2, TMP6); |
44f54ceb | 582 | |
bb270c08 | 583 | vis_xor(REF_4, REF_6, TMP8); |
44f54ceb | 584 | |
bb270c08 | 585 | vis_and(TMP6, MASK_fe, TMP6); |
44f54ceb | 586 | |
bb270c08 DB |
587 | vis_mul8x16(CONST_128, TMP6, TMP6); |
588 | vis_and(TMP8, MASK_fe, TMP8); | |
44f54ceb | 589 | |
bb270c08 DB |
590 | vis_mul8x16(CONST_128, TMP8, TMP8); |
591 | vis_or(REF_0, REF_2, TMP10); | |
44f54ceb | 592 | |
bb270c08 | 593 | vis_or(REF_4, REF_6, TMP12); |
44f54ceb | 594 | |
bb270c08 | 595 | vis_and(TMP6, MASK_7f, TMP6); |
44f54ceb | 596 | |
bb270c08 | 597 | vis_and(TMP8, MASK_7f, TMP8); |
44f54ceb | 598 | |
bb270c08 DB |
599 | vis_psub16(TMP10, TMP6, TMP6); |
600 | vis_st64(TMP6, dest[0]); | |
44f54ceb | 601 | |
bb270c08 DB |
602 | vis_psub16(TMP12, TMP8, TMP8); |
603 | vis_st64_2(TMP8, dest, 8); | |
44f54ceb MN |
604 | } |
605 | ||
86decad6 | 606 | static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * ref, |
bb270c08 | 607 | const int stride, int height) |
44f54ceb | 608 | { |
bb270c08 DB |
609 | unsigned long off = (unsigned long) ref & 0x7; |
610 | unsigned long off_plus_1 = off + 1; | |
44f54ceb | 611 | |
bb270c08 | 612 | ref = vis_alignaddr(ref); |
44f54ceb | 613 | |
bb270c08 | 614 | vis_ld64(ref[0], TMP0); |
44f54ceb | 615 | |
bb270c08 | 616 | vis_ld64(ref[8], TMP2); |
44f54ceb | 617 | |
bb270c08 | 618 | vis_ld64(constants_fe[0], MASK_fe); |
44f54ceb | 619 | |
bb270c08 | 620 | vis_ld64(constants_7f[0], MASK_7f); |
44f54ceb | 621 | |
bb270c08 DB |
622 | vis_ld64(constants128[0], CONST_128); |
623 | vis_faligndata(TMP0, TMP2, REF_0); | |
44f54ceb | 624 | |
bb270c08 DB |
625 | if (off != 0x7) { |
626 | vis_alignaddr_g0((void *)off_plus_1); | |
627 | vis_faligndata(TMP0, TMP2, REF_2); | |
628 | } else { | |
629 | vis_src1(TMP2, REF_2); | |
630 | } | |
44f54ceb | 631 | |
bb270c08 DB |
632 | ref += stride; |
633 | height = (height >> 1) - 1; | |
44f54ceb | 634 | |
bb270c08 DB |
635 | do { /* 20 cycles */ |
636 | vis_ld64(ref[0], TMP0); | |
637 | vis_xor(REF_0, REF_2, TMP4); | |
44f54ceb | 638 | |
bb270c08 DB |
639 | vis_ld64_2(ref, 8, TMP2); |
640 | vis_and(TMP4, MASK_fe, TMP4); | |
641 | ref += stride; | |
44f54ceb | 642 | |
bb270c08 DB |
643 | vis_ld64(ref[0], TMP8); |
644 | vis_or(REF_0, REF_2, TMP6); | |
645 | vis_mul8x16(CONST_128, TMP4, TMP4); | |
44f54ceb | 646 | |
bb270c08 | 647 | vis_alignaddr_g0((void *)off); |
44f54ceb | 648 | |
bb270c08 DB |
649 | vis_ld64_2(ref, 8, TMP10); |
650 | ref += stride; | |
651 | vis_faligndata(TMP0, TMP2, REF_0); | |
44f54ceb | 652 | |
bb270c08 DB |
653 | if (off != 0x7) { |
654 | vis_alignaddr_g0((void *)off_plus_1); | |
655 | vis_faligndata(TMP0, TMP2, REF_2); | |
656 | } else { | |
657 | vis_src1(TMP2, REF_2); | |
658 | } | |
44f54ceb | 659 | |
bb270c08 | 660 | vis_and(TMP4, MASK_7f, TMP4); |
44f54ceb | 661 | |
bb270c08 DB |
662 | vis_psub16(TMP6, TMP4, DST_0); |
663 | vis_st64(DST_0, dest[0]); | |
664 | dest += stride; | |
44f54ceb | 665 | |
bb270c08 | 666 | vis_xor(REF_0, REF_2, TMP12); |
44f54ceb | 667 | |
bb270c08 | 668 | vis_and(TMP12, MASK_fe, TMP12); |
44f54ceb | 669 | |
bb270c08 DB |
670 | vis_or(REF_0, REF_2, TMP14); |
671 | vis_mul8x16(CONST_128, TMP12, TMP12); | |
44f54ceb | 672 | |
bb270c08 DB |
673 | vis_alignaddr_g0((void *)off); |
674 | vis_faligndata(TMP8, TMP10, REF_0); | |
675 | if (off != 0x7) { | |
676 | vis_alignaddr_g0((void *)off_plus_1); | |
677 | vis_faligndata(TMP8, TMP10, REF_2); | |
678 | } else { | |
679 | vis_src1(TMP10, REF_2); | |
680 | } | |
44f54ceb | 681 | |
bb270c08 | 682 | vis_and(TMP12, MASK_7f, TMP12); |
44f54ceb | 683 | |
bb270c08 DB |
684 | vis_psub16(TMP14, TMP12, DST_0); |
685 | vis_st64(DST_0, dest[0]); | |
686 | dest += stride; | |
687 | } while (--height); | |
44f54ceb | 688 | |
bb270c08 DB |
689 | vis_ld64(ref[0], TMP0); |
690 | vis_xor(REF_0, REF_2, TMP4); | |
44f54ceb | 691 | |
bb270c08 DB |
692 | vis_ld64_2(ref, 8, TMP2); |
693 | vis_and(TMP4, MASK_fe, TMP4); | |
44f54ceb | 694 | |
bb270c08 DB |
695 | vis_or(REF_0, REF_2, TMP6); |
696 | vis_mul8x16(CONST_128, TMP4, TMP4); | |
44f54ceb | 697 | |
bb270c08 | 698 | vis_alignaddr_g0((void *)off); |
44f54ceb | 699 | |
bb270c08 | 700 | vis_faligndata(TMP0, TMP2, REF_0); |
44f54ceb | 701 | |
bb270c08 DB |
702 | if (off != 0x7) { |
703 | vis_alignaddr_g0((void *)off_plus_1); | |
704 | vis_faligndata(TMP0, TMP2, REF_2); | |
705 | } else { | |
706 | vis_src1(TMP2, REF_2); | |
707 | } | |
44f54ceb | 708 | |
bb270c08 | 709 | vis_and(TMP4, MASK_7f, TMP4); |
44f54ceb | 710 | |
bb270c08 DB |
711 | vis_psub16(TMP6, TMP4, DST_0); |
712 | vis_st64(DST_0, dest[0]); | |
713 | dest += stride; | |
44f54ceb | 714 | |
bb270c08 | 715 | vis_xor(REF_0, REF_2, TMP12); |
44f54ceb | 716 | |
bb270c08 | 717 | vis_and(TMP12, MASK_fe, TMP12); |
44f54ceb | 718 | |
bb270c08 DB |
719 | vis_or(REF_0, REF_2, TMP14); |
720 | vis_mul8x16(CONST_128, TMP12, TMP12); | |
44f54ceb | 721 | |
bb270c08 | 722 | vis_and(TMP12, MASK_7f, TMP12); |
44f54ceb | 723 | |
bb270c08 DB |
724 | vis_psub16(TMP14, TMP12, DST_0); |
725 | vis_st64(DST_0, dest[0]); | |
726 | dest += stride; | |
44f54ceb MN |
727 | } |
728 | ||
86decad6 | 729 | static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * ref, |
bb270c08 | 730 | const int stride, int height) |
44f54ceb | 731 | { |
bb270c08 DB |
732 | unsigned long off = (unsigned long) ref & 0x7; |
733 | unsigned long off_plus_1 = off + 1; | |
44f54ceb | 734 | |
bb270c08 | 735 | vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); |
44f54ceb | 736 | |
bb270c08 DB |
737 | vis_ld64(constants3[0], CONST_3); |
738 | vis_fzero(ZERO); | |
739 | vis_ld64(constants256_512[0], CONST_256); | |
44f54ceb | 740 | |
bb270c08 DB |
741 | ref = vis_alignaddr(ref); |
742 | do { /* 26 cycles */ | |
743 | vis_ld64(ref[0], TMP0); | |
44f54ceb | 744 | |
bb270c08 | 745 | vis_ld64(ref[8], TMP2); |
44f54ceb | 746 | |
bb270c08 | 747 | vis_alignaddr_g0((void *)off); |
44f54ceb | 748 | |
bb270c08 | 749 | vis_ld64(ref[16], TMP4); |
44f54ceb | 750 | |
bb270c08 DB |
751 | vis_ld64(dest[0], DST_0); |
752 | vis_faligndata(TMP0, TMP2, REF_0); | |
44f54ceb | 753 | |
bb270c08 DB |
754 | vis_ld64(dest[8], DST_2); |
755 | vis_faligndata(TMP2, TMP4, REF_4); | |
44f54ceb | 756 | |
bb270c08 DB |
757 | if (off != 0x7) { |
758 | vis_alignaddr_g0((void *)off_plus_1); | |
759 | vis_faligndata(TMP0, TMP2, REF_2); | |
760 | vis_faligndata(TMP2, TMP4, REF_6); | |
761 | } else { | |
762 | vis_src1(TMP2, REF_2); | |
763 | vis_src1(TMP4, REF_6); | |
764 | } | |
44f54ceb | 765 | |
bb270c08 | 766 | vis_mul8x16au(REF_0, CONST_256, TMP0); |
44f54ceb | 767 | |
bb270c08 DB |
768 | vis_pmerge(ZERO, REF_2, TMP4); |
769 | vis_mul8x16au(REF_0_1, CONST_256, TMP2); | |
44f54ceb | 770 | |
bb270c08 | 771 | vis_pmerge(ZERO, REF_2_1, TMP6); |
44f54ceb | 772 | |
bb270c08 | 773 | vis_padd16(TMP0, TMP4, TMP0); |
44f54ceb | 774 | |
bb270c08 DB |
775 | vis_mul8x16al(DST_0, CONST_512, TMP4); |
776 | vis_padd16(TMP2, TMP6, TMP2); | |
44f54ceb | 777 | |
bb270c08 | 778 | vis_mul8x16al(DST_1, CONST_512, TMP6); |
44f54ceb | 779 | |
bb270c08 | 780 | vis_mul8x16au(REF_6, CONST_256, TMP12); |
44f54ceb | 781 | |
bb270c08 DB |
782 | vis_padd16(TMP0, TMP4, TMP0); |
783 | vis_mul8x16au(REF_6_1, CONST_256, TMP14); | |
44f54ceb | 784 | |
bb270c08 DB |
785 | vis_padd16(TMP2, TMP6, TMP2); |
786 | vis_mul8x16au(REF_4, CONST_256, TMP16); | |
44f54ceb | 787 | |
bb270c08 DB |
788 | vis_padd16(TMP0, CONST_3, TMP8); |
789 | vis_mul8x16au(REF_4_1, CONST_256, TMP18); | |
44f54ceb | 790 | |
bb270c08 DB |
791 | vis_padd16(TMP2, CONST_3, TMP10); |
792 | vis_pack16(TMP8, DST_0); | |
44f54ceb | 793 | |
bb270c08 DB |
794 | vis_pack16(TMP10, DST_1); |
795 | vis_padd16(TMP16, TMP12, TMP0); | |
44f54ceb | 796 | |
bb270c08 DB |
797 | vis_st64(DST_0, dest[0]); |
798 | vis_mul8x16al(DST_2, CONST_512, TMP4); | |
799 | vis_padd16(TMP18, TMP14, TMP2); | |
44f54ceb | 800 | |
bb270c08 DB |
801 | vis_mul8x16al(DST_3, CONST_512, TMP6); |
802 | vis_padd16(TMP0, CONST_3, TMP0); | |
44f54ceb | 803 | |
bb270c08 | 804 | vis_padd16(TMP2, CONST_3, TMP2); |
44f54ceb | 805 | |
bb270c08 | 806 | vis_padd16(TMP0, TMP4, TMP0); |
44f54ceb | 807 | |
bb270c08 DB |
808 | vis_padd16(TMP2, TMP6, TMP2); |
809 | vis_pack16(TMP0, DST_2); | |
44f54ceb | 810 | |
bb270c08 DB |
811 | vis_pack16(TMP2, DST_3); |
812 | vis_st64(DST_2, dest[8]); | |
44f54ceb | 813 | |
bb270c08 DB |
814 | ref += stride; |
815 | dest += stride; | |
816 | } while (--height); | |
44f54ceb MN |
817 | } |
818 | ||
86decad6 | 819 | static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * ref, |
bb270c08 | 820 | const int stride, int height) |
44f54ceb | 821 | { |
bb270c08 DB |
822 | unsigned long off = (unsigned long) ref & 0x7; |
823 | unsigned long off_plus_1 = off + 1; | |
824 | int stride_times_2 = stride << 1; | |
44f54ceb | 825 | |
bb270c08 | 826 | vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); |
44f54ceb | 827 | |
bb270c08 DB |
828 | vis_ld64(constants3[0], CONST_3); |
829 | vis_fzero(ZERO); | |
830 | vis_ld64(constants256_512[0], CONST_256); | |
44f54ceb | 831 | |
bb270c08 DB |
832 | ref = vis_alignaddr(ref); |
833 | height >>= 2; | |
834 | do { /* 47 cycles */ | |
835 | vis_ld64(ref[0], TMP0); | |
44f54ceb | 836 | |
bb270c08 DB |
837 | vis_ld64_2(ref, 8, TMP2); |
838 | ref += stride; | |
44f54ceb | 839 | |
bb270c08 | 840 | vis_alignaddr_g0((void *)off); |
44f54ceb | 841 | |
bb270c08 DB |
842 | vis_ld64(ref[0], TMP4); |
843 | vis_faligndata(TMP0, TMP2, REF_0); | |
44f54ceb | 844 | |
bb270c08 DB |
845 | vis_ld64_2(ref, 8, TMP6); |
846 | ref += stride; | |
44f54ceb | 847 | |
bb270c08 | 848 | vis_ld64(ref[0], TMP8); |
44f54ceb | 849 | |
bb270c08 DB |
850 | vis_ld64_2(ref, 8, TMP10); |
851 | ref += stride; | |
852 | vis_faligndata(TMP4, TMP6, REF_4); | |
44f54ceb | 853 | |
bb270c08 | 854 | vis_ld64(ref[0], TMP12); |
44f54ceb | 855 | |
bb270c08 DB |
856 | vis_ld64_2(ref, 8, TMP14); |
857 | ref += stride; | |
858 | vis_faligndata(TMP8, TMP10, REF_S0); | |
44f54ceb | 859 | |
bb270c08 | 860 | vis_faligndata(TMP12, TMP14, REF_S4); |
44f54ceb | 861 | |
bb270c08 DB |
862 | if (off != 0x7) { |
863 | vis_alignaddr_g0((void *)off_plus_1); | |
44f54ceb | 864 | |
bb270c08 DB |
865 | vis_ld64(dest[0], DST_0); |
866 | vis_faligndata(TMP0, TMP2, REF_2); | |
44f54ceb | 867 | |
bb270c08 DB |
868 | vis_ld64_2(dest, stride, DST_2); |
869 | vis_faligndata(TMP4, TMP6, REF_6); | |
44f54ceb | 870 | |
bb270c08 | 871 | vis_faligndata(TMP8, TMP10, REF_S2); |
44f54ceb | 872 | |
bb270c08 DB |
873 | vis_faligndata(TMP12, TMP14, REF_S6); |
874 | } else { | |
875 | vis_ld64(dest[0], DST_0); | |
876 | vis_src1(TMP2, REF_2); | |
44f54ceb | 877 | |
bb270c08 DB |
878 | vis_ld64_2(dest, stride, DST_2); |
879 | vis_src1(TMP6, REF_6); | |
44f54ceb | 880 | |
bb270c08 | 881 | vis_src1(TMP10, REF_S2); |
44f54ceb | 882 | |
bb270c08 DB |
883 | vis_src1(TMP14, REF_S6); |
884 | } | |
44f54ceb | 885 | |
bb270c08 DB |
886 | vis_pmerge(ZERO, REF_0, TMP0); |
887 | vis_mul8x16au(REF_0_1, CONST_256, TMP2); | |
44f54ceb | 888 | |
bb270c08 DB |
889 | vis_pmerge(ZERO, REF_2, TMP4); |
890 | vis_mul8x16au(REF_2_1, CONST_256, TMP6); | |
44f54ceb | 891 | |
bb270c08 DB |
892 | vis_padd16(TMP0, CONST_3, TMP0); |
893 | vis_mul8x16al(DST_0, CONST_512, TMP16); | |
44f54ceb | 894 | |
bb270c08 DB |
895 | vis_padd16(TMP2, CONST_3, TMP2); |
896 | vis_mul8x16al(DST_1, CONST_512, TMP18); | |
44f54ceb | 897 | |
bb270c08 DB |
898 | vis_padd16(TMP0, TMP4, TMP0); |
899 | vis_mul8x16au(REF_4, CONST_256, TMP8); | |
44f54ceb | 900 | |
bb270c08 DB |
901 | vis_padd16(TMP2, TMP6, TMP2); |
902 | vis_mul8x16au(REF_4_1, CONST_256, TMP10); | |
44f54ceb | 903 | |
bb270c08 DB |
904 | vis_padd16(TMP0, TMP16, TMP0); |
905 | vis_mul8x16au(REF_6, CONST_256, TMP12); | |
44f54ceb | 906 | |
bb270c08 DB |
907 | vis_padd16(TMP2, TMP18, TMP2); |
908 | vis_mul8x16au(REF_6_1, CONST_256, TMP14); | |
44f54ceb | 909 | |
bb270c08 DB |
910 | vis_padd16(TMP8, CONST_3, TMP8); |
911 | vis_mul8x16al(DST_2, CONST_512, TMP16); | |
44f54ceb | 912 | |
bb270c08 DB |
913 | vis_padd16(TMP8, TMP12, TMP8); |
914 | vis_mul8x16al(DST_3, CONST_512, TMP18); | |
44f54ceb | 915 | |
bb270c08 DB |
916 | vis_padd16(TMP10, TMP14, TMP10); |
917 | vis_pack16(TMP0, DST_0); | |
44f54ceb | 918 | |
bb270c08 DB |
919 | vis_pack16(TMP2, DST_1); |
920 | vis_st64(DST_0, dest[0]); | |
921 | dest += stride; | |
922 | vis_padd16(TMP10, CONST_3, TMP10); | |
44f54ceb | 923 | |
bb270c08 DB |
924 | vis_ld64_2(dest, stride, DST_0); |
925 | vis_padd16(TMP8, TMP16, TMP8); | |
44f54ceb | 926 | |
bb270c08 DB |
927 | vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/); |
928 | vis_padd16(TMP10, TMP18, TMP10); | |
929 | vis_pack16(TMP8, DST_2); | |
44f54ceb | 930 | |
bb270c08 DB |
931 | vis_pack16(TMP10, DST_3); |
932 | vis_st64(DST_2, dest[0]); | |
933 | dest += stride; | |
44f54ceb | 934 | |
bb270c08 DB |
935 | vis_mul8x16au(REF_S0_1, CONST_256, TMP2); |
936 | vis_pmerge(ZERO, REF_S0, TMP0); | |
44f54ceb | 937 | |
bb270c08 DB |
938 | vis_pmerge(ZERO, REF_S2, TMP24); |
939 | vis_mul8x16au(REF_S2_1, CONST_256, TMP6); | |
44f54ceb | 940 | |
bb270c08 DB |
941 | vis_padd16(TMP0, CONST_3, TMP0); |
942 | vis_mul8x16au(REF_S4, CONST_256, TMP8); | |
44f54ceb | 943 | |
bb270c08 DB |
944 | vis_padd16(TMP2, CONST_3, TMP2); |
945 | vis_mul8x16au(REF_S4_1, CONST_256, TMP10); | |
44f54ceb | 946 | |
bb270c08 DB |
947 | vis_padd16(TMP0, TMP24, TMP0); |
948 | vis_mul8x16au(REF_S6, CONST_256, TMP12); | |
44f54ceb | 949 | |
bb270c08 DB |
950 | vis_padd16(TMP2, TMP6, TMP2); |
951 | vis_mul8x16au(REF_S6_1, CONST_256, TMP14); | |
44f54ceb | 952 | |
bb270c08 DB |
953 | vis_padd16(TMP8, CONST_3, TMP8); |
954 | vis_mul8x16al(DST_0, CONST_512, TMP16); | |
44f54ceb | 955 | |
bb270c08 DB |
956 | vis_padd16(TMP10, CONST_3, TMP10); |
957 | vis_mul8x16al(DST_1, CONST_512, TMP18); | |
44f54ceb | 958 | |
bb270c08 DB |
959 | vis_padd16(TMP8, TMP12, TMP8); |
960 | vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20); | |
44f54ceb | 961 | |
bb270c08 DB |
962 | vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22); |
963 | vis_padd16(TMP0, TMP16, TMP0); | |
44f54ceb | 964 | |
bb270c08 DB |
965 | vis_padd16(TMP2, TMP18, TMP2); |
966 | vis_pack16(TMP0, DST_0); | |
44f54ceb | 967 | |
bb270c08 DB |
968 | vis_padd16(TMP10, TMP14, TMP10); |
969 | vis_pack16(TMP2, DST_1); | |
970 | vis_st64(DST_0, dest[0]); | |
971 | dest += stride; | |
44f54ceb | 972 | |
bb270c08 | 973 | vis_padd16(TMP8, TMP20, TMP8); |
44f54ceb | 974 | |
bb270c08 DB |
975 | vis_padd16(TMP10, TMP22, TMP10); |
976 | vis_pack16(TMP8, DST_2); | |
44f54ceb | 977 | |
bb270c08 DB |
978 | vis_pack16(TMP10, DST_3); |
979 | vis_st64(DST_2, dest[0]); | |
980 | dest += stride; | |
981 | } while (--height); | |
44f54ceb MN |
982 | } |
983 | ||
/* Put (no destination blend) 16x`height` half-pel-Y prediction:
 * each output row is the rounded average of two vertically adjacent
 * source rows. Uses the header's identity (x+y+1)>>1 == (x|y)-((x^y)>>1):
 * XOR/AND-with-MASK_fe/mul8x16-by-CONST_128 computes (x^y)>>1 per byte,
 * which is subtracted from (x|y). REF_0/REF_4 hold the previous row,
 * REF_2/REF_6 the next; loads and ALU ops are interleaved for pipeline
 * scheduling (see the cycle-count annotation on the loop).
 * NOTE(review): assumes height is even and >= 4 — the loop runs
 * (height>>1)-1 times and the final row pair is peeled after it. */
static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * ref,
                             const int stride, int height)
{
    /* Align ref down to 8 bytes; faligndata below re-extracts the
       misaligned 16-byte row from three 8-byte loads. */
    ref = vis_alignaddr(ref);
    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, 8, TMP2);

    vis_ld64_2(ref, 16, TMP4);
    ref += stride;

    vis_ld64(ref[0], TMP6);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64_2(ref, 8, TMP8);
    vis_faligndata(TMP2, TMP4, REF_4);

    vis_ld64_2(ref, 16, TMP10);
    ref += stride;

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP6, TMP8, REF_2);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP8, TMP10, REF_6);

    vis_ld64(constants128[0], CONST_128);
    /* Two rows per iteration; last pair handled after the loop. */
    height = (height >> 1) - 1;
    do {    /* 24 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP12);

        vis_ld64_2(ref, 8, TMP2);
        vis_xor(REF_4, REF_6, TMP16);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;
        vis_or(REF_0, REF_2, TMP14);

        vis_ld64(ref[0], TMP6);
        vis_or(REF_4, REF_6, TMP18);

        vis_ld64_2(ref, 8, TMP8);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, 16, TMP10);
        ref += stride;
        vis_faligndata(TMP2, TMP4, REF_4);

        /* (x^y)>>1 per byte: mask LSBs, then multiply by 128/256. */
        vis_and(TMP12, MASK_fe, TMP12);

        vis_and(TMP16, MASK_fe, TMP16);
        vis_mul8x16(CONST_128, TMP12, TMP12);

        vis_mul8x16(CONST_128, TMP16, TMP16);
        vis_xor(REF_0, REF_2, TMP0);

        vis_xor(REF_4, REF_6, TMP2);

        vis_or(REF_0, REF_2, TMP20);

        vis_and(TMP12, MASK_7f, TMP12);

        vis_and(TMP16, MASK_7f, TMP16);

        /* (x|y) - ((x^y)>>1) == rounded average; store 16 bytes. */
        vis_psub16(TMP14, TMP12, TMP12);
        vis_st64(TMP12, dest[0]);

        vis_psub16(TMP18, TMP16, TMP16);
        vis_st64_2(TMP16, dest, 8);
        dest += stride;

        vis_or(REF_4, REF_6, TMP18);

        vis_and(TMP0, MASK_fe, TMP0);

        vis_and(TMP2, MASK_fe, TMP2);
        vis_mul8x16(CONST_128, TMP0, TMP0);

        vis_faligndata(TMP6, TMP8, REF_2);
        vis_mul8x16(CONST_128, TMP2, TMP2);

        vis_faligndata(TMP8, TMP10, REF_6);

        vis_and(TMP0, MASK_7f, TMP0);

        vis_and(TMP2, MASK_7f, TMP2);

        vis_psub16(TMP20, TMP0, TMP0);
        vis_st64(TMP0, dest[0]);

        vis_psub16(TMP18, TMP2, TMP2);
        vis_st64_2(TMP2, dest, 8);
        dest += stride;
    } while (--height);

    /* Epilogue: final pair of output rows (no further row prefetch). */
    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP12);

    vis_ld64_2(ref, 8, TMP2);
    vis_xor(REF_4, REF_6, TMP16);

    vis_ld64_2(ref, 16, TMP4);
    vis_or(REF_0, REF_2, TMP14);

    vis_or(REF_4, REF_6, TMP18);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_4);

    vis_and(TMP12, MASK_fe, TMP12);

    vis_and(TMP16, MASK_fe, TMP16);
    vis_mul8x16(CONST_128, TMP12, TMP12);

    vis_mul8x16(CONST_128, TMP16, TMP16);
    vis_xor(REF_0, REF_2, TMP0);

    vis_xor(REF_4, REF_6, TMP2);

    vis_or(REF_0, REF_2, TMP20);

    vis_and(TMP12, MASK_7f, TMP12);

    vis_and(TMP16, MASK_7f, TMP16);

    vis_psub16(TMP14, TMP12, TMP12);
    vis_st64(TMP12, dest[0]);

    vis_psub16(TMP18, TMP16, TMP16);
    vis_st64_2(TMP16, dest, 8);
    dest += stride;

    vis_or(REF_4, REF_6, TMP18);

    vis_and(TMP0, MASK_fe, TMP0);

    vis_and(TMP2, MASK_fe, TMP2);
    vis_mul8x16(CONST_128, TMP0, TMP0);

    vis_mul8x16(CONST_128, TMP2, TMP2);

    vis_and(TMP0, MASK_7f, TMP0);

    vis_and(TMP2, MASK_7f, TMP2);

    vis_psub16(TMP20, TMP0, TMP0);
    vis_st64(TMP0, dest[0]);

    vis_psub16(TMP18, TMP2, TMP2);
    vis_st64_2(TMP2, dest, 8);
}
1137 | ||
/* Put 8x`height` half-pel-Y prediction: 8-byte-wide version of
 * MC_put_y_16_vis. Each output row is (row[i]+row[i+1]+1)>>1 computed
 * with the (x|y)-((x^y)>>1) identity from the file header.
 * NOTE(review): assumes height is even and >= 4 — the loop executes
 * (height>>1)-1 times and the last row pair is peeled below it. */
static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * ref,
                            const int stride, int height)
{
    ref = vis_alignaddr(ref);
    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, 8, TMP2);
    ref += stride;

    vis_ld64(ref[0], TMP4);

    vis_ld64_2(ref, 8, TMP6);
    ref += stride;

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP4, TMP6, REF_2);

    vis_ld64(constants128[0], CONST_128);
    /* Two rows per iteration; final pair handled in the epilogue. */
    height = (height >> 1) - 1;
    do {    /* 12 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP4);

        vis_ld64_2(ref, 8, TMP2);
        ref += stride;
        vis_and(TMP4, MASK_fe, TMP4);

        vis_or(REF_0, REF_2, TMP6);
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_faligndata(TMP0, TMP2, REF_0);
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);
        ref += stride;
        vis_xor(REF_0, REF_2, TMP12);

        vis_and(TMP4, MASK_7f, TMP4);

        vis_and(TMP12, MASK_fe, TMP12);

        vis_mul8x16(CONST_128, TMP12, TMP12);
        vis_or(REF_0, REF_2, TMP14);

        /* average = (x|y) - ((x^y)>>1) */
        vis_psub16(TMP6, TMP4, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;

        vis_faligndata(TMP0, TMP2, REF_2);

        vis_and(TMP12, MASK_7f, TMP12);

        vis_psub16(TMP14, TMP12, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;
    } while (--height);

    /* Epilogue: last two output rows. */
    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP4);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_or(REF_0, REF_2, TMP6);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_xor(REF_0, REF_2, TMP12);

    vis_and(TMP4, MASK_7f, TMP4);

    vis_and(TMP12, MASK_fe, TMP12);

    vis_mul8x16(CONST_128, TMP12, TMP12);
    vis_or(REF_0, REF_2, TMP14);

    vis_psub16(TMP6, TMP4, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;

    vis_and(TMP12, MASK_7f, TMP12);

    vis_psub16(TMP14, TMP12, DST_0);
    vis_st64(DST_0, dest[0]);
}
1227 | ||
/* Average 16x`height` half-pel-Y prediction into dest: computes the
 * vertical two-tap average of the reference (via widening mul8x16au
 * against CONST_256, plus rounding CONST_3) and blends it with the
 * existing dest pixels (loaded into DST_*/REF_S* and scaled by
 * CONST_512), then packs back to bytes with the GSR scale factor set
 * to 5. Two output rows per iteration, instructions interleaved for
 * scheduling (see the cycle annotation).
 * NOTE(review): assumes height is even — loop count is height>>1. */
static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * ref,
                             const int stride, int height)
{
    int stride_8 = stride + 8;
    int stride_16 = stride + 16;

    /* pack16 will shift down by this factor when repacking to bytes. */
    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(constants3[0], CONST_3);      /* rounding bias */
    vis_faligndata(TMP0, TMP2, REF_2);

    vis_ld64(constants256_512[0], CONST_256);
    vis_faligndata(TMP2, TMP4, REF_6);
    height >>= 1;

    do {    /* 31 cycles */
        vis_ld64_2(ref, stride, TMP0);
        vis_pmerge(ZERO, REF_2, TMP12);
        vis_mul8x16au(REF_2_1, CONST_256, TMP14);

        vis_ld64_2(ref, stride_8, TMP2);
        vis_pmerge(ZERO, REF_6, TMP16);
        vis_mul8x16au(REF_6_1, CONST_256, TMP18);

        vis_ld64_2(ref, stride_16, TMP4);
        ref += stride;

        vis_ld64(dest[0], DST_0);          /* existing dest row 0 */
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(dest, 8, DST_2);
        vis_faligndata(TMP2, TMP4, REF_4);

        vis_ld64_2(ref, stride, TMP6);
        vis_pmerge(ZERO, REF_0, TMP0);
        vis_mul8x16au(REF_0_1, CONST_256, TMP2);

        vis_ld64_2(ref, stride_8, TMP8);
        vis_pmerge(ZERO, REF_4, TMP4);

        vis_ld64_2(ref, stride_16, TMP10);
        ref += stride;

        /* REF_S0/REF_S2 are reused here as extra dest registers. */
        vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
        vis_faligndata(TMP6, TMP8, REF_2);
        vis_mul8x16au(REF_4_1, CONST_256, TMP6);

        vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
        vis_faligndata(TMP8, TMP10, REF_6);
        vis_mul8x16al(DST_0, CONST_512, TMP20);

        vis_padd16(TMP0, CONST_3, TMP0);
        vis_mul8x16al(DST_1, CONST_512, TMP22);

        vis_padd16(TMP2, CONST_3, TMP2);
        vis_mul8x16al(DST_2, CONST_512, TMP24);

        vis_padd16(TMP4, CONST_3, TMP4);
        vis_mul8x16al(DST_3, CONST_512, TMP26);

        vis_padd16(TMP6, CONST_3, TMP6);

        vis_padd16(TMP12, TMP20, TMP12);
        vis_mul8x16al(REF_S0, CONST_512, TMP20);

        vis_padd16(TMP14, TMP22, TMP14);
        vis_mul8x16al(REF_S0_1, CONST_512, TMP22);

        vis_padd16(TMP16, TMP24, TMP16);
        vis_mul8x16al(REF_S2, CONST_512, TMP24);

        vis_padd16(TMP18, TMP26, TMP18);
        vis_mul8x16al(REF_S2_1, CONST_512, TMP26);

        vis_padd16(TMP12, TMP0, TMP12);
        vis_mul8x16au(REF_2, CONST_256, TMP28);

        vis_padd16(TMP14, TMP2, TMP14);
        vis_mul8x16au(REF_2_1, CONST_256, TMP30);

        vis_padd16(TMP16, TMP4, TMP16);
        vis_mul8x16au(REF_6, CONST_256, REF_S4);

        vis_padd16(TMP18, TMP6, TMP18);
        vis_mul8x16au(REF_6_1, CONST_256, REF_S6);

        vis_pack16(TMP12, DST_0);
        vis_padd16(TMP28, TMP0, TMP12);

        vis_pack16(TMP14, DST_1);
        vis_st64(DST_0, dest[0]);
        vis_padd16(TMP30, TMP2, TMP14);

        vis_pack16(TMP16, DST_2);
        vis_padd16(REF_S4, TMP4, TMP16);

        vis_pack16(TMP18, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;
        vis_padd16(REF_S6, TMP6, TMP18);

        vis_padd16(TMP12, TMP20, TMP12);

        vis_padd16(TMP14, TMP22, TMP14);
        vis_pack16(TMP12, DST_0);

        vis_padd16(TMP16, TMP24, TMP16);
        vis_pack16(TMP14, DST_1);
        vis_st64(DST_0, dest[0]);

        vis_padd16(TMP18, TMP26, TMP18);
        vis_pack16(TMP16, DST_2);

        vis_pack16(TMP18, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;
    } while (--height);
}
1355 | ||
/* Average 8x`height` half-pel-Y prediction into dest: 8-byte-wide
 * version of MC_avg_y_16_vis. The vertical two-tap reference average
 * (widened with CONST_256, rounded with CONST_3) is blended with the
 * current dest bytes (scaled by CONST_512) and repacked with GSR
 * scale factor 5. Two output rows per loop iteration.
 * NOTE(review): assumes height is even — loop count is height>>1. */
static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * ref,
                            const int stride, int height)
{
    int stride_8 = stride + 8;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(constants3[0], CONST_3);      /* rounding bias */
    vis_faligndata(TMP0, TMP2, REF_2);

    vis_ld64(constants256_512[0], CONST_256);

    height >>= 1;
    do {    /* 20 cycles */
        vis_ld64_2(ref, stride, TMP0);
        vis_pmerge(ZERO, REF_2, TMP8);
        vis_mul8x16au(REF_2_1, CONST_256, TMP10);

        vis_ld64_2(ref, stride_8, TMP2);
        ref += stride;

        vis_ld64(dest[0], DST_0);          /* existing dest rows */

        vis_ld64_2(dest, stride, DST_2);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, stride, TMP4);
        vis_mul8x16al(DST_0, CONST_512, TMP16);
        vis_pmerge(ZERO, REF_0, TMP12);

        vis_ld64_2(ref, stride_8, TMP6);
        ref += stride;
        vis_mul8x16al(DST_1, CONST_512, TMP18);
        vis_pmerge(ZERO, REF_0_1, TMP14);

        vis_padd16(TMP12, CONST_3, TMP12);
        vis_mul8x16al(DST_2, CONST_512, TMP24);

        vis_padd16(TMP14, CONST_3, TMP14);
        vis_mul8x16al(DST_3, CONST_512, TMP26);

        vis_faligndata(TMP4, TMP6, REF_2);

        vis_padd16(TMP8, TMP12, TMP8);

        vis_padd16(TMP10, TMP14, TMP10);
        vis_mul8x16au(REF_2, CONST_256, TMP20);

        vis_padd16(TMP8, TMP16, TMP0);
        vis_mul8x16au(REF_2_1, CONST_256, TMP22);

        vis_padd16(TMP10, TMP18, TMP2);
        vis_pack16(TMP0, DST_0);

        vis_pack16(TMP2, DST_1);
        vis_st64(DST_0, dest[0]);
        dest += stride;
        vis_padd16(TMP12, TMP20, TMP12);

        vis_padd16(TMP14, TMP22, TMP14);

        vis_padd16(TMP12, TMP24, TMP0);

        vis_padd16(TMP14, TMP26, TMP2);
        vis_pack16(TMP0, DST_2);

        vis_pack16(TMP2, DST_3);
        vis_st64(DST_2, dest[0]);
        dest += stride;
    } while (--height);
}
1434 | ||
/* Put 16x`height` half-pel-XY prediction: four-point average of the
 * 2x2 pixel neighborhood. The horizontally shifted copy of each row
 * (REF_2/REF_6/REF_S2/REF_S6) is produced by re-running faligndata
 * with alignment off+1 — except when off == 7, where the shift crosses
 * entirely into the next 8-byte word and vis_src1 supplies it instead.
 * CONST_2 is the rounding bias; GSR scale factor 5 repacks the 16-bit
 * sums back to bytes. Two output rows per iteration.
 * NOTE(review): assumes height is even — loop count is height>>1. */
static void MC_put_xy_16_vis (uint8_t * dest, const uint8_t * ref,
                              const int stride, int height)
{
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;
    int stride_16 = stride + 16;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(constants2[0], CONST_2);      /* rounding bias */
    vis_faligndata(TMP0, TMP2, REF_S0);

    vis_ld64(constants256_512[0], CONST_256);
    vis_faligndata(TMP2, TMP4, REF_S4);

    /* Shifted-by-one-pixel copies of the priming row. */
    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_S2);
        vis_faligndata(TMP2, TMP4, REF_S6);
    } else {
        vis_src1(TMP2, REF_S2);
        vis_src1(TMP4, REF_S6);
    }

    height >>= 1;
    do {
        vis_ld64_2(ref, stride, TMP0);
        vis_mul8x16au(REF_S0, CONST_256, TMP12);
        vis_pmerge(ZERO, REF_S0_1, TMP14);

        vis_alignaddr_g0((void *)off);     /* restore unshifted alignment */

        vis_ld64_2(ref, stride_8, TMP2);
        vis_mul8x16au(REF_S2, CONST_256, TMP16);
        vis_pmerge(ZERO, REF_S2_1, TMP18);

        vis_ld64_2(ref, stride_16, TMP4);
        ref += stride;
        vis_mul8x16au(REF_S4, CONST_256, TMP20);
        vis_pmerge(ZERO, REF_S4_1, TMP22);

        vis_ld64_2(ref, stride, TMP6);
        vis_mul8x16au(REF_S6, CONST_256, TMP24);
        vis_pmerge(ZERO, REF_S6_1, TMP26);

        vis_ld64_2(ref, stride_8, TMP8);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, stride_16, TMP10);
        ref += stride;
        vis_faligndata(TMP2, TMP4, REF_4);

        vis_faligndata(TMP6, TMP8, REF_S0);

        vis_faligndata(TMP8, TMP10, REF_S4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
            vis_faligndata(TMP6, TMP8, REF_S2);
            vis_faligndata(TMP8, TMP10, REF_S6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
            vis_src1(TMP8, REF_S2);
            vis_src1(TMP10, REF_S6);
        }

        vis_mul8x16au(REF_0, CONST_256, TMP0);
        vis_pmerge(ZERO, REF_0_1, TMP2);

        vis_mul8x16au(REF_2, CONST_256, TMP4);
        vis_pmerge(ZERO, REF_2_1, TMP6);

        vis_padd16(TMP0, CONST_2, TMP8);
        vis_mul8x16au(REF_4, CONST_256, TMP0);

        vis_padd16(TMP2, CONST_2, TMP10);
        vis_mul8x16au(REF_4_1, CONST_256, TMP2);

        vis_padd16(TMP8, TMP4, TMP8);
        vis_mul8x16au(REF_6, CONST_256, TMP4);

        vis_padd16(TMP10, TMP6, TMP10);
        vis_mul8x16au(REF_6_1, CONST_256, TMP6);

        vis_padd16(TMP12, TMP8, TMP12);

        vis_padd16(TMP14, TMP10, TMP14);

        vis_padd16(TMP12, TMP16, TMP12);

        vis_padd16(TMP14, TMP18, TMP14);
        vis_pack16(TMP12, DST_0);

        vis_pack16(TMP14, DST_1);
        vis_st64(DST_0, dest[0]);
        vis_padd16(TMP0, CONST_2, TMP12);

        vis_mul8x16au(REF_S0, CONST_256, TMP0);
        vis_padd16(TMP2, CONST_2, TMP14);

        vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
        vis_padd16(TMP12, TMP4, TMP12);

        vis_mul8x16au(REF_S2, CONST_256, TMP4);
        vis_padd16(TMP14, TMP6, TMP14);

        vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
        vis_padd16(TMP20, TMP12, TMP20);

        vis_padd16(TMP22, TMP14, TMP22);

        vis_padd16(TMP20, TMP24, TMP20);

        vis_padd16(TMP22, TMP26, TMP22);
        vis_pack16(TMP20, DST_2);

        vis_pack16(TMP22, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;
        vis_padd16(TMP0, TMP4, TMP24);

        vis_mul8x16au(REF_S4, CONST_256, TMP0);
        vis_padd16(TMP2, TMP6, TMP26);

        vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
        vis_padd16(TMP24, TMP8, TMP24);

        vis_padd16(TMP26, TMP10, TMP26);
        vis_pack16(TMP24, DST_0);

        vis_pack16(TMP26, DST_1);
        vis_st64(DST_0, dest[0]);
        vis_pmerge(ZERO, REF_S6, TMP4);

        vis_pmerge(ZERO, REF_S6_1, TMP6);

        vis_padd16(TMP0, TMP4, TMP0);

        vis_padd16(TMP2, TMP6, TMP2);

        vis_padd16(TMP0, TMP12, TMP0);

        vis_padd16(TMP2, TMP14, TMP2);
        vis_pack16(TMP0, DST_2);

        vis_pack16(TMP2, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;
    } while (--height);
}
1598 | ||
/* Put 8x`height` half-pel-XY prediction: 8-byte-wide version of
 * MC_put_xy_16_vis. Four-point 2x2 average; the one-pixel-right copy
 * of each row comes from faligndata at alignment off+1, or vis_src1
 * when off == 7 (shift falls entirely into the next 8-byte word).
 * CONST_2 rounds; GSR scale factor 5 repacks 16-bit sums to bytes.
 * Two output rows per iteration.
 * NOTE(review): assumes height is even — loop count is height>>1. */
static void MC_put_xy_8_vis (uint8_t * dest, const uint8_t * ref,
                             const int stride, int height)
{
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(constants2[0], CONST_2);      /* rounding bias */

    vis_ld64(constants256_512[0], CONST_256);
    vis_faligndata(TMP0, TMP2, REF_S0);

    /* Shifted copy of the priming row. */
    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_S2);
    } else {
        vis_src1(TMP2, REF_S2);
    }

    height >>= 1;
    do {    /* 26 cycles */
        vis_ld64_2(ref, stride, TMP0);
        vis_mul8x16au(REF_S0, CONST_256, TMP8);
        vis_pmerge(ZERO, REF_S2, TMP12);

        vis_alignaddr_g0((void *)off); /* restore unshifted alignment */

        vis_ld64_2(ref, stride_8, TMP2);
        ref += stride;
        vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
        vis_pmerge(ZERO, REF_S2_1, TMP14);

        vis_ld64_2(ref, stride, TMP4);

        vis_ld64_2(ref, stride_8, TMP6);
        ref += stride;
        vis_faligndata(TMP0, TMP2, REF_S4);

        vis_pmerge(ZERO, REF_S4, TMP18);

        vis_pmerge(ZERO, REF_S4_1, TMP20);

        vis_faligndata(TMP4, TMP6, REF_S0);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_S6);
            vis_faligndata(TMP4, TMP6, REF_S2);
        } else {
            vis_src1(TMP2, REF_S6);
            vis_src1(TMP6, REF_S2);
        }

        vis_padd16(TMP18, CONST_2, TMP18);
        vis_mul8x16au(REF_S6, CONST_256, TMP22);

        vis_padd16(TMP20, CONST_2, TMP20);
        vis_mul8x16au(REF_S6_1, CONST_256, TMP24);

        vis_mul8x16au(REF_S0, CONST_256, TMP26);
        vis_pmerge(ZERO, REF_S0_1, TMP28);

        vis_mul8x16au(REF_S2, CONST_256, TMP30);
        vis_padd16(TMP18, TMP22, TMP18);

        vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
        vis_padd16(TMP20, TMP24, TMP20);

        vis_padd16(TMP8, TMP18, TMP8);

        vis_padd16(TMP10, TMP20, TMP10);

        vis_padd16(TMP8, TMP12, TMP8);

        vis_padd16(TMP10, TMP14, TMP10);
        vis_pack16(TMP8, DST_0);

        vis_pack16(TMP10, DST_1);
        vis_st64(DST_0, dest[0]);
        dest += stride;
        vis_padd16(TMP18, TMP26, TMP18);

        vis_padd16(TMP20, TMP28, TMP20);

        vis_padd16(TMP18, TMP30, TMP18);

        vis_padd16(TMP20, TMP32, TMP20);
        vis_pack16(TMP18, DST_2);

        vis_pack16(TMP20, DST_3);
        vis_st64(DST_2, dest[0]);
        dest += stride;
    } while (--height);
}
1702 | ||
/* Motion compensation: average the existing 16-byte-wide destination block
 * with the two-dimensionally (x+y half-pel) interpolated reference block,
 * rounding variant.
 *
 * NOTE(review): per the constants used, each output pixel combines four
 * reference taps (expanded to 16 bit via CONST_256/pmerge), the old dest
 * pixel weighted through CONST_1024, and rounding constant CONST_6, then
 * is narrowed by vis_pack16 with a GSR scale factor of 4 — confirm the
 * exact weighting against the scalar reference implementation.
 *
 * dest:   destination block, read-modify-written in place.
 * ref:    reference block; may be misaligned, handled via alignaddr.
 * stride: byte distance between consecutive lines of both blocks.
 * height: line count; two lines are produced per loop iteration.
 *
 * The statement order is hand-scheduled for UltraSPARC (see the cycle
 * count on the loop); do not reorder.
 */
static void MC_avg_xy_16_vis (uint8_t * dest, const uint8_t * ref,
                              const int stride, int height)
{
    /* byte misalignment of ref within its 8-byte word */
    unsigned long off = (unsigned long) ref & 0x7;
    /* alignment for the stream shifted right by one pixel */
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;
    int stride_16 = stride + 16;

    /* pack16 below will shift by this scale factor when narrowing */
    vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    /* prologue: load first line (16 pixels need 3 words when misaligned) */
    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(constants6[0], CONST_6);
    vis_faligndata(TMP0, TMP2, REF_S0);

    vis_ld64(constants256_1024[0], CONST_256);
    vis_faligndata(TMP2, TMP4, REF_S4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_S2);
        vis_faligndata(TMP2, TMP4, REF_S6);
    } else {
        /* off + 1 == 8: the one-pixel-shifted data is exactly the next
         * word, so take it directly instead of using faligndata */
        vis_src1(TMP2, REF_S2);
        vis_src1(TMP4, REF_S6);
    }

    height >>= 1;               /* two output lines per iteration */
    do {        /* 55 cycles */
        vis_ld64_2(ref, stride, TMP0);
        vis_mul8x16au(REF_S0, CONST_256, TMP12);
        vis_pmerge(ZERO, REF_S0_1, TMP14);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, stride_8, TMP2);
        vis_mul8x16au(REF_S2, CONST_256, TMP16);
        vis_pmerge(ZERO, REF_S2_1, TMP18);

        vis_ld64_2(ref, stride_16, TMP4);
        ref += stride;
        vis_mul8x16au(REF_S4, CONST_256, TMP20);
        vis_pmerge(ZERO, REF_S4_1, TMP22);

        vis_ld64_2(ref, stride, TMP6);
        vis_mul8x16au(REF_S6, CONST_256, TMP24);
        vis_pmerge(ZERO, REF_S6_1, TMP26);

        vis_ld64_2(ref, stride_8, TMP8);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, stride_16, TMP10);
        ref += stride;
        vis_faligndata(TMP2, TMP4, REF_4);

        vis_ld64(dest[0], DST_0);
        vis_faligndata(TMP6, TMP8, REF_S0);

        vis_ld64_2(dest, 8, DST_2);
        vis_faligndata(TMP8, TMP10, REF_S4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
            vis_faligndata(TMP6, TMP8, REF_S2);
            vis_faligndata(TMP8, TMP10, REF_S6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
            vis_src1(TMP8, REF_S2);
            vis_src1(TMP10, REF_S6);
        }

        /* accumulate taps + dest contribution for the first output line */
        vis_mul8x16al(DST_0, CONST_1024, TMP30);
        vis_pmerge(ZERO, REF_0, TMP0);

        vis_mul8x16al(DST_1, CONST_1024, TMP32);
        vis_pmerge(ZERO, REF_0_1, TMP2);

        vis_mul8x16au(REF_2, CONST_256, TMP4);
        vis_pmerge(ZERO, REF_2_1, TMP6);

        vis_mul8x16al(DST_2, CONST_1024, REF_0);
        vis_padd16(TMP0, CONST_6, TMP0);

        vis_mul8x16al(DST_3, CONST_1024, REF_2);
        vis_padd16(TMP2, CONST_6, TMP2);

        vis_padd16(TMP0, TMP4, TMP0);
        vis_mul8x16au(REF_4, CONST_256, TMP4);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_mul8x16au(REF_4_1, CONST_256, TMP6);

        vis_padd16(TMP12, TMP0, TMP12);
        vis_mul8x16au(REF_6, CONST_256, TMP8);

        vis_padd16(TMP14, TMP2, TMP14);
        vis_mul8x16au(REF_6_1, CONST_256, TMP10);

        vis_padd16(TMP12, TMP16, TMP12);
        vis_mul8x16au(REF_S0, CONST_256, REF_4);

        vis_padd16(TMP14, TMP18, TMP14);
        vis_mul8x16au(REF_S0_1, CONST_256, REF_6);

        vis_padd16(TMP12, TMP30, TMP12);

        vis_padd16(TMP14, TMP32, TMP14);
        vis_pack16(TMP12, DST_0);

        vis_pack16(TMP14, DST_1);
        vis_st64(DST_0, dest[0]);       /* first 8 bytes of line 1 */
        vis_padd16(TMP4, CONST_6, TMP4);

        vis_ld64_2(dest, stride, DST_0);
        vis_padd16(TMP6, CONST_6, TMP6);
        vis_mul8x16au(REF_S2, CONST_256, TMP12);

        vis_padd16(TMP4, TMP8, TMP4);
        vis_mul8x16au(REF_S2_1, CONST_256, TMP14);

        vis_padd16(TMP6, TMP10, TMP6);

        vis_padd16(TMP20, TMP4, TMP20);

        vis_padd16(TMP22, TMP6, TMP22);

        vis_padd16(TMP20, TMP24, TMP20);

        vis_padd16(TMP22, TMP26, TMP22);

        vis_padd16(TMP20, REF_0, TMP20);
        vis_mul8x16au(REF_S4, CONST_256, REF_0);

        vis_padd16(TMP22, REF_2, TMP22);
        vis_pack16(TMP20, DST_2);

        vis_pack16(TMP22, DST_3);
        vis_st64_2(DST_2, dest, 8);     /* second 8 bytes of line 1 */
        dest += stride;

        /* second output line of this iteration */
        vis_ld64_2(dest, 8, DST_2);
        vis_mul8x16al(DST_0, CONST_1024, TMP30);
        vis_pmerge(ZERO, REF_S4_1, REF_2);

        vis_mul8x16al(DST_1, CONST_1024, TMP32);
        vis_padd16(REF_4, TMP0, TMP8);

        vis_mul8x16au(REF_S6, CONST_256, REF_4);
        vis_padd16(REF_6, TMP2, TMP10);

        vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
        vis_padd16(TMP8, TMP12, TMP8);

        vis_padd16(TMP10, TMP14, TMP10);

        vis_padd16(TMP8, TMP30, TMP8);

        vis_padd16(TMP10, TMP32, TMP10);
        vis_pack16(TMP8, DST_0);

        vis_pack16(TMP10, DST_1);
        vis_st64(DST_0, dest[0]);       /* first 8 bytes of line 2 */

        vis_padd16(REF_0, TMP4, REF_0);

        vis_mul8x16al(DST_2, CONST_1024, TMP30);
        vis_padd16(REF_2, TMP6, REF_2);

        vis_mul8x16al(DST_3, CONST_1024, TMP32);
        vis_padd16(REF_0, REF_4, REF_0);

        vis_padd16(REF_2, REF_6, REF_2);

        vis_padd16(REF_0, TMP30, REF_0);

        /* stall */

        vis_padd16(REF_2, TMP32, REF_2);
        vis_pack16(REF_0, DST_2);

        vis_pack16(REF_2, DST_3);
        vis_st64_2(DST_2, dest, 8);     /* second 8 bytes of line 2 */
        dest += stride;
    } while (--height);
}
1898 | ||
/* Motion compensation: 8-byte-wide variant of MC_avg_xy_16_vis — average
 * the existing destination block with the x+y half-pel interpolated
 * reference block, rounding variant.
 *
 * NOTE(review): weighting inferred from the constants (four ref taps via
 * CONST_256/pmerge, dest via CONST_1024, rounding via CONST_6, pack16
 * with GSR scale 4) — confirm against the scalar reference.
 *
 * dest:   destination block, read-modify-written in place.
 * ref:    reference block; may be misaligned, handled via alignaddr.
 * stride: byte distance between consecutive lines of both blocks.
 * height: line count; two lines are produced per loop iteration.
 *
 * Hand-scheduled for UltraSPARC (31-cycle loop); do not reorder.
 */
static void MC_avg_xy_8_vis (uint8_t * dest, const uint8_t * ref,
                             const int stride, int height)
{
    /* byte misalignment of ref within its 8-byte word */
    unsigned long off = (unsigned long) ref & 0x7;
    /* alignment for the stream shifted right by one pixel */
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;

    /* pack16 below uses this scale factor when narrowing to 8 bit */
    vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    /* prologue: first line (8 pixels need 2 words when misaligned) */
    vis_ld64(ref[0], TMP0);
    vis_fzero(ZERO);

    vis_ld64_2(ref, 8, TMP2);

    vis_ld64(constants6[0], CONST_6);

    vis_ld64(constants256_1024[0], CONST_256);
    vis_faligndata(TMP0, TMP2, REF_S0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_S2);
    } else {
        /* off + 1 == 8: shifted data is exactly the second word */
        vis_src1(TMP2, REF_S2);
    }

    height >>= 1;               /* two output lines per iteration */
    do {        /* 31 cycles */
        vis_ld64_2(ref, stride, TMP0);
        vis_mul8x16au(REF_S0, CONST_256, TMP8);
        vis_pmerge(ZERO, REF_S0_1, TMP10);

        vis_ld64_2(ref, stride_8, TMP2);
        ref += stride;
        vis_mul8x16au(REF_S2, CONST_256, TMP12);
        vis_pmerge(ZERO, REF_S2_1, TMP14);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, stride, TMP4);
        vis_faligndata(TMP0, TMP2, REF_S4);

        vis_ld64_2(ref, stride_8, TMP6);
        ref += stride;

        vis_ld64(dest[0], DST_0);
        vis_faligndata(TMP4, TMP6, REF_S0);

        vis_ld64_2(dest, stride, DST_2);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_S6);
            vis_faligndata(TMP4, TMP6, REF_S2);
        } else {
            vis_src1(TMP2, REF_S6);
            vis_src1(TMP6, REF_S2);
        }

        /* accumulate taps + dest contribution for line 1 */
        vis_mul8x16al(DST_0, CONST_1024, TMP30);
        vis_pmerge(ZERO, REF_S4, TMP22);

        vis_mul8x16al(DST_1, CONST_1024, TMP32);
        vis_pmerge(ZERO, REF_S4_1, TMP24);

        vis_mul8x16au(REF_S6, CONST_256, TMP26);
        vis_pmerge(ZERO, REF_S6_1, TMP28);

        vis_mul8x16au(REF_S0, CONST_256, REF_S4);
        vis_padd16(TMP22, CONST_6, TMP22);

        vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
        vis_padd16(TMP24, CONST_6, TMP24);

        vis_mul8x16al(DST_2, CONST_1024, REF_0);
        vis_padd16(TMP22, TMP26, TMP22);

        vis_mul8x16al(DST_3, CONST_1024, REF_2);
        vis_padd16(TMP24, TMP28, TMP24);

        vis_mul8x16au(REF_S2, CONST_256, TMP26);
        vis_padd16(TMP8, TMP22, TMP8);

        vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
        vis_padd16(TMP10, TMP24, TMP10);

        vis_padd16(TMP8, TMP12, TMP8);

        vis_padd16(TMP10, TMP14, TMP10);

        vis_padd16(TMP8, TMP30, TMP8);

        vis_padd16(TMP10, TMP32, TMP10);
        vis_pack16(TMP8, DST_0);

        vis_pack16(TMP10, DST_1);
        vis_st64(DST_0, dest[0]);       /* output line 1 */
        dest += stride;

        /* second output line reuses the TMP22/TMP24 partial sums */
        vis_padd16(REF_S4, TMP22, TMP12);

        vis_padd16(REF_S6, TMP24, TMP14);

        vis_padd16(TMP12, TMP26, TMP12);

        vis_padd16(TMP14, TMP28, TMP14);

        vis_padd16(TMP12, REF_0, TMP12);

        vis_padd16(TMP14, REF_2, TMP14);
        vis_pack16(TMP12, DST_2);

        vis_pack16(TMP14, DST_3);
        vis_st64(DST_2, dest[0]);       /* output line 2 */
        dest += stride;
    } while (--height);
}
2018 | ||
2019 | /* End of rounding code */ | |
2020 | ||
2021 | /* Start of no rounding code */ | |
2022 | /* The trick used in some of this file is the formula from the MMX | |
2023 | * motion comp code, which is: | |
2024 | * | |
2025 | * (x+y)>>1 == (x&y)+((x^y)>>1) | |
2026 | * | |
2027 | * This allows us to average 8 bytes at a time in a 64-bit FPU reg. | |
2028 | * We avoid overflows by masking before we do the shift, and we | |
2029 | * implement the shift by multiplying by 1/2 using mul8x16. So in | |
2030 | * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask | |
2031 | * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and | |
2032 | * the value 0x80808080 is in f8): | |
2033 | * | |
bb270c08 DB |
2034 | * fxor f0, f2, f10 |
2035 | * fand f10, f4, f10 | |
2036 | * fmul8x16 f8, f10, f10 | |
2037 | * fand f10, f6, f10 | |
2038 | * fand f0, f2, f12 | |
2039 | * fpadd16 f12, f10, f10 | |
44f54ceb MN |
2040 | */ |
2041 | ||
/* Motion compensation: copy a 16-byte-wide block from ref to dest with no
 * interpolation (the "o" / zero-offset case needs no averaging, so the
 * no-round and rounding variants coincide here).  Misaligned ref is
 * handled with alignaddr/faligndata: three 8-byte loads per line feed two
 * realigned 8-byte stores.
 *
 * dest:   destination block (overwritten).
 * ref:    reference block; may be misaligned.
 * stride: byte distance between consecutive lines of both blocks.
 * height: number of lines to copy; must be >= 1 (do/while).
 */
static void MC_put_no_round_o_16_vis (uint8_t * dest, const uint8_t * ref,
                                      const int stride, int height)
{
    ref = vis_alignaddr(ref);
    do {        /* 5 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;

        vis_faligndata(TMP0, TMP2, REF_0);
        vis_st64(REF_0, dest[0]);

        vis_faligndata(TMP2, TMP4, REF_2);
        vis_st64_2(REF_2, dest, 8);
        dest += stride;
    } while (--height);
}
2062 | ||
/* Motion compensation: copy an 8-byte-wide block from ref to dest with no
 * interpolation (zero-offset case; no-round and rounding variants are
 * identical for a plain copy).  Two 8-byte loads per line are realigned
 * into one 8-byte store.
 *
 * dest:   destination block (overwritten).
 * ref:    reference block; may be misaligned.
 * stride: byte distance between consecutive lines of both blocks.
 * height: number of lines to copy; must be >= 1 (do/while).
 */
static void MC_put_no_round_o_8_vis (uint8_t * dest, const uint8_t * ref,
                                     const int stride, int height)
{
    ref = vis_alignaddr(ref);
    do {        /* 4 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);
        ref += stride;

        /* stall */

        vis_faligndata(TMP0, TMP2, REF_0);
        vis_st64(REF_0, dest[0]);
        dest += stride;
    } while (--height);
}
2080 | ||
2081 | ||
/* Motion compensation: average a 16-byte-wide destination block with the
 * (zero-offset) reference block, truncating variant.  Each pair
 * (DST, REF) is averaged as (x+y)>>1 == (x&y)+((x^y)>>1), computed per
 * the scheme documented above:
 *   xor -> mask 0xfe -> halve via mul8x16(CONST_128) -> mask 0x7f,
 *   then padd16 with (x & y).
 *
 * dest:   destination block, read-modify-written in place.
 * ref:    reference block; may be misaligned, handled via alignaddr.
 * stride: byte distance between consecutive lines of both blocks.
 * height: line count; must be an even value >= 4
 *         (the loop runs (height>>1)-1 times, plus a two-line epilogue).
 *
 * The loop is software-pipelined (loads for the next iteration overlap
 * arithmetic for the current one); do not reorder statements.
 */
static void MC_avg_no_round_o_16_vis (uint8_t * dest, const uint8_t * ref,
                                      const int stride, int height)
{
    int stride_8 = stride + 8;

    ref = vis_alignaddr(ref);

    /* prologue: first ref line and first dest line, plus constants */
    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(dest[0], DST_0);

    vis_ld64(dest[8], DST_2);

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP2, TMP4, REF_2);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;     /* epilogue handles the last 2 lines */

    do {        /* 24 cycles */
        /* line A: (DST^REF) path interleaved with next line's loads */
        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP6);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP6, MASK_fe, TMP6);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;
        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_xor(DST_2, REF_2, TMP8);

        vis_and(TMP8, MASK_fe, TMP8);

        vis_and(DST_0, REF_0, TMP10);
        vis_ld64_2(dest, stride, DST_0);
        vis_mul8x16(CONST_128, TMP8, TMP8);

        vis_and(DST_2, REF_2, TMP12);
        vis_ld64_2(dest, stride_8, DST_2);

        vis_ld64(ref[0], TMP14);
        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_padd16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_padd16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);

        /* line B: same average on the freshly loaded line */
        dest += stride;
        vis_ld64_2(ref, 8, TMP16);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, 16, TMP18);
        vis_faligndata(TMP2, TMP4, REF_2);
        ref += stride;

        vis_xor(DST_0, REF_0, TMP20);

        vis_and(TMP20, MASK_fe, TMP20);

        vis_xor(DST_2, REF_2, TMP22);
        vis_mul8x16(CONST_128, TMP20, TMP20);

        vis_and(TMP22, MASK_fe, TMP22);

        vis_and(DST_0, REF_0, TMP24);
        vis_mul8x16(CONST_128, TMP22, TMP22);

        vis_and(DST_2, REF_2, TMP26);

        vis_ld64_2(dest, stride, DST_0);
        vis_faligndata(TMP14, TMP16, REF_0);

        vis_ld64_2(dest, stride_8, DST_2);
        vis_faligndata(TMP16, TMP18, REF_2);

        vis_and(TMP20, MASK_7f, TMP20);

        vis_and(TMP22, MASK_7f, TMP22);

        vis_padd16(TMP24, TMP20, TMP20);
        vis_st64(TMP20, dest[0]);

        vis_padd16(TMP26, TMP22, TMP22);
        vis_st64_2(TMP22, dest, 8);
        dest += stride;
    } while (--height);

    /* epilogue: last two lines, without the next-iteration prefetch */
    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_ld64_2(ref, 16, TMP4);
    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_xor(DST_2, REF_2, TMP8);

    vis_and(TMP8, MASK_fe, TMP8);

    vis_and(DST_0, REF_0, TMP10);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP8, TMP8);

    vis_and(DST_2, REF_2, TMP12);
    vis_ld64_2(dest, stride_8, DST_2);

    vis_ld64(ref[0], TMP14);
    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_padd16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_padd16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);

    dest += stride;
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_2);

    vis_xor(DST_0, REF_0, TMP20);

    vis_and(TMP20, MASK_fe, TMP20);

    vis_xor(DST_2, REF_2, TMP22);
    vis_mul8x16(CONST_128, TMP20, TMP20);

    vis_and(TMP22, MASK_fe, TMP22);

    vis_and(DST_0, REF_0, TMP24);
    vis_mul8x16(CONST_128, TMP22, TMP22);

    vis_and(DST_2, REF_2, TMP26);

    vis_and(TMP20, MASK_7f, TMP20);

    vis_and(TMP22, MASK_7f, TMP22);

    vis_padd16(TMP24, TMP20, TMP20);
    vis_st64(TMP20, dest[0]);

    vis_padd16(TMP26, TMP22, TMP22);
    vis_st64_2(TMP22, dest, 8);
}
2241 | ||
/* Motion compensation: average an 8-byte-wide destination block with the
 * (zero-offset) reference block, truncating variant — 8-wide counterpart
 * of MC_avg_no_round_o_16_vis, using the same
 * (x+y)>>1 == (x&y)+((x^y)>>1) bit trick
 * (xor -> 0xfe mask -> halve via mul8x16(CONST_128) -> 0x7f mask -> add).
 *
 * dest:   destination block, read-modify-written in place.
 * ref:    reference block; may be misaligned, handled via alignaddr.
 * stride: byte distance between consecutive lines of both blocks.
 * height: line count; must be an even value >= 4
 *         (the loop runs (height>>1)-1 times, plus a two-line epilogue).
 *
 * Software-pipelined; do not reorder statements.
 */
static void MC_avg_no_round_o_8_vis (uint8_t * dest, const uint8_t * ref,
                                     const int stride, int height)
{
    ref = vis_alignaddr(ref);

    /* prologue: first ref/dest line and the mask constants */
    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(dest[0], DST_0);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;     /* epilogue handles the last 2 lines */

    do {        /* 12 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP4);

        vis_ld64(ref[8], TMP2);
        vis_and(TMP4, MASK_fe, TMP4);

        vis_and(DST_0, REF_0, TMP6);
        vis_ld64_2(dest, stride, DST_0);
        ref += stride;
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_ld64(ref[0], TMP12);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(ref[8], TMP2);
        vis_xor(DST_0, REF_0, TMP0);
        ref += stride;

        vis_and(TMP0, MASK_fe, TMP0);

        vis_and(TMP4, MASK_7f, TMP4);

        vis_padd16(TMP6, TMP4, TMP4);
        vis_st64(TMP4, dest[0]);        /* output line 1 */
        dest += stride;
        vis_mul8x16(CONST_128, TMP0, TMP0);

        vis_and(DST_0, REF_0, TMP6);
        vis_ld64_2(dest, stride, DST_0);

        vis_faligndata(TMP12, TMP2, REF_0);

        vis_and(TMP0, MASK_7f, TMP0);

        vis_padd16(TMP6, TMP0, TMP4);
        vis_st64(TMP4, dest[0]);        /* output line 2 */
        dest += stride;
    } while (--height);

    /* epilogue: last two lines without the next-iteration prefetch */
    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP4);

    vis_ld64(ref[8], TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_and(DST_0, REF_0, TMP6);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_xor(DST_0, REF_0, TMP0);

    vis_and(TMP0, MASK_fe, TMP0);

    vis_and(TMP4, MASK_7f, TMP4);

    vis_padd16(TMP6, TMP4, TMP4);
    vis_st64(TMP4, dest[0]);
    dest += stride;
    vis_mul8x16(CONST_128, TMP0, TMP0);

    vis_and(DST_0, REF_0, TMP6);

    vis_and(TMP0, MASK_7f, TMP0);

    vis_padd16(TMP6, TMP0, TMP4);
    vis_st64(TMP4, dest[0]);
}
2333 | ||
/* Motion compensation: write the horizontally (x) half-pel interpolated
 * 16-byte-wide reference block to dest, truncating (no-round) variant.
 * For each pixel, the aligned stream (REF_0/REF_4) and the stream shifted
 * right by one pixel (REF_2/REF_6) are averaged with
 * (x+y)>>1 == (x&y)+((x^y)>>1)
 * (xor -> 0xfe mask -> halve via mul8x16(CONST_128) -> 0x7f mask -> add).
 *
 * dest:   destination block (overwritten).
 * ref:    reference block; may be misaligned, handled via alignaddr.
 * stride: byte distance between consecutive lines of both blocks.
 * height: line count; must be an even value >= 4
 *         (the loop runs (height>>1)-1 times, plus a two-line epilogue).
 *
 * Software-pipelined; do not reorder statements.
 */
static void MC_put_no_round_x_16_vis (uint8_t * dest, const uint8_t * ref,
                                      const int stride, int height)
{
    /* byte misalignment of ref within its 8-byte word */
    unsigned long off = (unsigned long) ref & 0x7;
    /* alignment for the stream shifted right by one pixel */
    unsigned long off_plus_1 = off + 1;

    ref = vis_alignaddr(ref);

    /* prologue: first line (3 words for 16 misaligned pixels) + masks */
    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, 8, TMP2);

    vis_ld64_2(ref, 16, TMP4);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants128[0], CONST_128);
    vis_faligndata(TMP2, TMP4, REF_4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
        vis_faligndata(TMP2, TMP4, REF_6);
    } else {
        /* off + 1 == 8: shifted data is exactly the next word */
        vis_src1(TMP2, REF_2);
        vis_src1(TMP4, REF_6);
    }

    ref += stride;
    height = (height >> 1) - 1;     /* epilogue handles the last 2 lines */

    do {        /* 34 cycles */
        /* line A: average current streams while loading the next 2 lines */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP6);

        vis_ld64_2(ref, 8, TMP2);
        vis_xor(REF_4, REF_6, TMP8);

        vis_ld64_2(ref, 16, TMP4);
        vis_and(TMP6, MASK_fe, TMP6);
        ref += stride;

        vis_ld64(ref[0], TMP14);
        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_and(TMP8, MASK_fe, TMP8);

        vis_ld64_2(ref, 8, TMP16);
        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_and(REF_0, REF_2, TMP10);

        vis_ld64_2(ref, 16, TMP18);
        ref += stride;
        vis_and(REF_4, REF_6, TMP12);

        vis_alignaddr_g0((void *)off);

        vis_faligndata(TMP0, TMP2, REF_0);

        vis_faligndata(TMP2, TMP4, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
        }

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_padd16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_padd16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
        dest += stride;

        /* line B: same average on the line loaded above */
        vis_xor(REF_0, REF_2, TMP6);

        vis_xor(REF_4, REF_6, TMP8);

        vis_and(TMP6, MASK_fe, TMP6);

        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_and(TMP8, MASK_fe, TMP8);

        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_and(REF_0, REF_2, TMP10);

        vis_and(REF_4, REF_6, TMP12);

        vis_alignaddr_g0((void *)off);

        vis_faligndata(TMP14, TMP16, REF_0);

        vis_faligndata(TMP16, TMP18, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP14, TMP16, REF_2);
            vis_faligndata(TMP16, TMP18, REF_6);
        } else {
            vis_src1(TMP16, REF_2);
            vis_src1(TMP18, REF_6);
        }

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_padd16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_padd16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
        dest += stride;
    } while (--height);

    /* epilogue: last two lines without further prefetching */
    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_xor(REF_4, REF_6, TMP8);

    vis_ld64_2(ref, 16, TMP4);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_and(TMP8, MASK_fe, TMP8);

    vis_mul8x16(CONST_128, TMP8, TMP8);
    vis_and(REF_0, REF_2, TMP10);

    vis_and(REF_4, REF_6, TMP12);

    vis_alignaddr_g0((void *)off);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
        vis_faligndata(TMP2, TMP4, REF_6);
    } else {
        vis_src1(TMP2, REF_2);
        vis_src1(TMP4, REF_6);
    }

    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_padd16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_padd16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);

    vis_xor(REF_0, REF_2, TMP6);

    vis_xor(REF_4, REF_6, TMP8);

    vis_and(TMP6, MASK_fe, TMP6);

    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_and(TMP8, MASK_fe, TMP8);

    vis_mul8x16(CONST_128, TMP8, TMP8);
    vis_and(REF_0, REF_2, TMP10);

    vis_and(REF_4, REF_6, TMP12);

    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_padd16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_padd16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);
}
2525 | ||
86decad6 | 2526 | static void MC_put_no_round_x_8_vis (uint8_t * dest, const uint8_t * ref, |
bb270c08 | 2527 | const int stride, int height) |
44f54ceb | 2528 | { |
bb270c08 DB |
2529 | unsigned long off = (unsigned long) ref & 0x7; |
2530 | unsigned long off_plus_1 = off + 1; | |
44f54ceb | 2531 | |
bb270c08 | 2532 | ref = vis_alignaddr(ref); |
44f54ceb | 2533 | |
bb270c08 | 2534 | vis_ld64(ref[0], TMP0); |
44f54ceb | 2535 | |
bb270c08 | 2536 | vis_ld64(ref[8], TMP2); |
44f54ceb | 2537 | |
bb270c08 | 2538 | vis_ld64(constants_fe[0], MASK_fe); |
44f54ceb | 2539 | |
bb270c08 | 2540 | vis_ld64(constants_7f[0], MASK_7f); |
44f54ceb | 2541 | |
bb270c08 DB |
2542 | vis_ld64(constants128[0], CONST_128); |
2543 | vis_faligndata(TMP0, TMP2, REF_0); | |
44f54ceb | 2544 | |
bb270c08 DB |
2545 | if (off != 0x7) { |
2546 | vis_alignaddr_g0((void *)off_plus_1); | |
2547 | vis_faligndata(TMP0, TMP2, REF_2); | |
2548 | } else { | |
2549 | vis_src1(TMP2, REF_2); | |
2550 | } | |
44f54ceb | 2551 | |
bb270c08 DB |
2552 | ref += stride; |
2553 | height = (height >> 1) - 1; | |
44f54ceb | 2554 | |
bb270c08 DB |
2555 | do { /* 20 cycles */ |
2556 | vis_ld64(ref[0], TMP0); | |
2557 | vis_xor(REF_0, REF_2, TMP4); | |
44f54ceb | 2558 | |
bb270c08 DB |
2559 | vis_ld64_2(ref, 8, TMP2); |
2560 | vis_and(TMP4, MASK_fe, TMP4); | |
2561 | ref += stride; | |
44f54ceb | 2562 | |
bb270c08 DB |
2563 | vis_ld64(ref[0], TMP8); |
2564 | vis_and(REF_0, REF_2, TMP6); | |
2565 | vis_mul8x16(CONST_128, TMP4, TMP4); | |
44f54ceb | 2566 | |
bb270c08 | 2567 | vis_alignaddr_g0((void *)off); |
44f54ceb | 2568 | |
bb270c08 DB |
2569 | vis_ld64_2(ref, 8, TMP10); |
2570 | ref += stride; | |
2571 | vis_faligndata(TMP0, TMP2, REF_0); | |
44f54ceb | 2572 | |
bb270c08 DB |
2573 | if (off != 0x7) { |
2574 | vis_alignaddr_g0((void *)off_plus_1); | |
2575 | vis_faligndata(TMP0, TMP2, REF_2); | |
2576 | } else { | |
2577 | vis_src1(TMP2, REF_2); | |
2578 | } | |
44f54ceb | 2579 | |
bb270c08 | 2580 | vis_and(TMP4, MASK_7f, TMP4); |
44f54ceb | 2581 | |
bb270c08 DB |
2582 | vis_padd16(TMP6, TMP4, DST_0); |
2583 | vis_st64(DST_0, dest[0]); | |
2584 | dest += stride; | |
44f54ceb | 2585 | |
bb270c08 | 2586 | vis_xor(REF_0, REF_2, TMP12); |
44f54ceb | 2587 | |
bb270c08 | 2588 | vis_and(TMP12, MASK_fe, TMP12); |
44f54ceb | 2589 | |
bb270c08 DB |
2590 | vis_and(REF_0, REF_2, TMP14); |
2591 | vis_mul8x16(CONST_128, TMP12, TMP12); | |
44f54ceb | 2592 | |
bb270c08 DB |
2593 | vis_alignaddr_g0((void *)off); |
2594 | vis_faligndata(TMP8, TMP10, REF_0); | |
2595 | if (off != 0x7) { | |
2596 | vis_alignaddr_g0((void *)off_plus_1); | |
2597 | vis_faligndata(TMP8, TMP10, REF_2); | |
2598 | } else { | |
2599 | vis_src1(TMP10, REF_2); | |
2600 | } | |
44f54ceb | 2601 | |
bb270c08 | 2602 | vis_and(TMP12, MASK_7f, TMP12); |
44f54ceb | 2603 | |
bb270c08 DB |
2604 | vis_padd16(TMP14, TMP12, DST_0); |
2605 | vis_st64(DST_0, dest[0]); | |
2606 | dest += stride; | |
2607 | } while (--height); | |
44f54ceb | 2608 | |
bb270c08 DB |
2609 | vis_ld64(ref[0], TMP0); |
2610 | vis_xor(REF_0, REF_2, TMP4); | |
44f54ceb | 2611 | |
bb270c08 DB |
2612 | vis_ld64_2(ref, 8, TMP2); |
2613 | vis_and(TMP4, MASK_fe, TMP4); | |
44f54ceb | 2614 | |
bb270c08 DB |
2615 | vis_and(REF_0, REF_2, TMP6); |
2616 | vis_mul8x16(CONST_128, TMP4, TMP4); | |
44f54ceb | 2617 | |
bb270c08 | 2618 | vis_alignaddr_g0((void *)off); |
44f54ceb | 2619 | |
bb270c08 | 2620 | vis_faligndata(TMP0, TMP2, REF_0); |
44f54ceb | 2621 | |
bb270c08 DB |
2622 | if (off != 0x7) { |
2623 | vis_alignaddr_g0((void *)off_plus_1); | |
2624 | vis_faligndata(TMP0, TMP2, REF_2); | |
2625 | } else { | |
2626 | vis_src1(TMP2, REF_2); | |
2627 | } | |
44f54ceb | 2628 | |
bb270c08 | 2629 | vis_and(TMP4, MASK_7f, TMP4); |
44f54ceb | 2630 | |
bb270c08 DB |
2631 | vis_padd16(TMP6, TMP4, DST_0); |
2632 | vis_st64(DST_0, dest[0]); | |
2633 | dest += stride; | |
44f54ceb | 2634 | |
bb270c08 | 2635 | vis_xor(REF_0, REF_2, TMP12); |
44f54ceb | 2636 | |
bb270c08 | 2637 | vis_and(TMP12, MASK_fe, TMP12); |
44f54ceb | 2638 | |
bb270c08 DB |
2639 | vis_and(REF_0, REF_2, TMP14); |
2640 | vis_mul8x16(CONST_128, TMP12, TMP12); | |
44f54ceb | 2641 | |
bb270c08 | 2642 | vis_and(TMP12, MASK_7f, TMP12); |
44f54ceb | 2643 | |
bb270c08 DB |
2644 | vis_padd16(TMP14, TMP12, DST_0); |
2645 | vis_st64(DST_0, dest[0]); | |
2646 | dest += stride; | |
44f54ceb MN |
2647 | } |
2648 | ||
/* Average 16-pixel-wide rows of `dest` with the horizontally half-pel
 * interpolated reference, using SPARC VIS SIMD intrinsics.
 * Per the file header, the *no_round* variants use truncating rather than
 * rounding averaging semantics — NOTE(review): exact rounding arithmetic
 * (role of CONST_3 / the pack16 scale factor) inferred from the constants
 * loaded below; confirm against the rounding MC_avg_x_16 twin elsewhere
 * in this file.
 *
 *  dest   : destination block, read-modify-written 16 bytes per row
 *  ref    : source reference block (may be unaligned)
 *  stride : byte step between rows for both dest and ref
 *  height : number of rows; must be >= 1 (do/while loops at least once)
 *
 * Register macros (TMP0.., REF_0.., DST_0.., CONST_*) are fixed FPU
 * register names defined elsewhere in this file — statement order below
 * is a deliberate pipeline schedule; do not reorder.
 */
86decad6 | 2649 | static void MC_avg_no_round_x_16_vis (uint8_t * dest, const uint8_t * ref, |
bb270c08 | 2650 | const int stride, int height) |
44f54ceb | 2651 | { |
bb270c08 DB |
/* off = misalignment of ref within its 8-byte word; off_plus_1 selects
 * the pixel one byte to the right (the x+1 sample of the half-pel pair). */
2652 | unsigned long off = (unsigned long) ref & 0x7; |
2653 | unsigned long off_plus_1 = off + 1; | |
44f54ceb | 2654 | |
/* GSR scale factor 5 — controls how vis_pack16 narrows the 16-bit
 * intermediates back to bytes (effectively the final shift of the
 * averaging sum). */
bb270c08 | 2655 | vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); |
44f54ceb | 2656 | |
bb270c08 DB |
/* Load SIMD constants: CONST_3 is the additive term folded into the sum
 * before packing; CONST_256/CONST_512 are the mul8x16 expansion factors
 * (256 for ref bytes, 512 for dest bytes — dest is weighted double,
 * consistent with avg(dest, avg(ref_x, ref_x+1))). */
2657 | vis_ld64(constants3[0], CONST_3); |
2658 | vis_fzero(ZERO); | |
2659 | vis_ld64(constants256_512[0], CONST_256); | |
44f54ceb | 2660 | |
bb270c08 DB |
/* Align ref down to an 8-byte boundary; faligndata + the GSR align offset
 * re-extracts the unaligned bytes. */
2661 | ref = vis_alignaddr(ref); |
2662 | do { /* 26 cycles */ | |
/* Load 24 aligned bytes of ref (enough for 16 unaligned output pixels
 * plus the x+1 sample) and 16 bytes of the current dest row. */
2663 | vis_ld64(ref[0], TMP0); | |
44f54ceb | 2664 | |
bb270c08 | 2665 | vis_ld64(ref[8], TMP2); |
44f54ceb | 2666 | |
bb270c08 | 2667 | vis_alignaddr_g0((void *)off); |
44f54ceb | 2668 | |
bb270c08 | 2669 | vis_ld64(ref[16], TMP4); |
44f54ceb | 2670 | |
bb270c08 DB |
/* REF_0/REF_4 = ref pixels at x; REF_2/REF_6 = ref pixels at x+1. */
2671 | vis_ld64(dest[0], DST_0); |
2672 | vis_faligndata(TMP0, TMP2, REF_0); | |
44f54ceb | 2673 | |
bb270c08 DB |
2674 | vis_ld64(dest[8], DST_2); |
2675 | vis_faligndata(TMP2, TMP4, REF_4); | |
44f54ceb | 2676 | |
bb270c08 DB |
/* off == 7 would make off_plus_1 == 8, which alignaddr cannot express;
 * vis_src1 handles that wrap-around case instead. */
2677 | if (off != 0x7) { |
2678 | vis_alignaddr_g0((void *)off_plus_1); | |
2679 | vis_faligndata(TMP0, TMP2, REF_2); | |
2680 | vis_faligndata(TMP2, TMP4, REF_6); | |
2681 | } else { | |
2682 | vis_src1(TMP2, REF_2); | |
2683 | vis_src1(TMP4, REF_6); | |
2684 | } | |
44f54ceb | 2685 | |
/* Widen the ref bytes to 16 bits (mul8x16au / pmerge-with-zero) so the
 * sums below cannot overflow 8 bits. */
bb270c08 | 2686 | vis_mul8x16au(REF_0, CONST_256, TMP0); |
44f54ceb | 2687 | |
bb270c08 DB |
2688 | vis_pmerge(ZERO, REF_2, TMP4); |
2689 | vis_mul8x16au(REF_0_1, CONST_256, TMP2); | |
44f54ceb | 2690 | |
bb270c08 | 2691 | vis_pmerge(ZERO, REF_2_1, TMP6); |
44f54ceb | 2692 | |
/* First 8 pixels: sum ref[x] + ref[x+1] ... */
bb270c08 | 2693 | vis_padd16(TMP0, TMP4, TMP0); |
44f54ceb | 2694 | |
bb270c08 DB |
/* ... expand dest with the 512 weight and accumulate it too. */
2695 | vis_mul8x16al(DST_0, CONST_512, TMP4); |
2696 | vis_padd16(TMP2, TMP6, TMP2); | |
44f54ceb | 2697 | |
bb270c08 | 2698 | vis_mul8x16al(DST_1, CONST_512, TMP6); |
44f54ceb | 2699 | |
/* Start widening the second 8 pixels (REF_4/REF_6) in parallel. */
bb270c08 | 2700 | vis_mul8x16au(REF_6, CONST_256, TMP12); |
44f54ceb | 2701 | |
bb270c08 DB |
2702 | vis_padd16(TMP0, TMP4, TMP0); |
2703 | vis_mul8x16au(REF_6_1, CONST_256, TMP14); | |
44f54ceb | 2704 | |
bb270c08 DB |
2705 | vis_padd16(TMP2, TMP6, TMP2); |
2706 | vis_mul8x16au(REF_4, CONST_256, TMP16); | |
44f54ceb | 2707 | |
bb270c08 DB |
/* Add the constant-3 term, then pack the 16-bit sums back to bytes
 * (shifted per the GSR scale factor set above). */
2708 | vis_padd16(TMP0, CONST_3, TMP8); |
2709 | vis_mul8x16au(REF_4_1, CONST_256, TMP18); | |
44f54ceb | 2710 | |
bb270c08 DB |
2711 | vis_padd16(TMP2, CONST_3, TMP10); |
2712 | vis_pack16(TMP8, DST_0); | |
44f54ceb | 2713 | |
bb270c08 DB |
2714 | vis_pack16(TMP10, DST_1); |
2715 | vis_padd16(TMP16, TMP12, TMP0); | |
44f54ceb | 2716 | |
bb270c08 DB |
/* Store the first 8 result bytes; repeat the same combine for the
 * second 8 pixels. */
2717 | vis_st64(DST_0, dest[0]); |
2718 | vis_mul8x16al(DST_2, CONST_512, TMP4); | |
2719 | vis_padd16(TMP18, TMP14, TMP2); | |
44f54ceb | 2720 | |
bb270c08 DB |
2721 | vis_mul8x16al(DST_3, CONST_512, TMP6); |
2722 | vis_padd16(TMP0, CONST_3, TMP0); | |
44f54ceb | 2723 | |
bb270c08 | 2724 | vis_padd16(TMP2, CONST_3, TMP2); |
44f54ceb | 2725 | |
bb270c08 | 2726 | vis_padd16(TMP0, TMP4, TMP0); |
44f54ceb | 2727 | |
bb270c08 DB |
2728 | vis_padd16(TMP2, TMP6, TMP2); |
2729 | vis_pack16(TMP0, DST_2); | |
44f54ceb | 2730 | |
bb270c08 DB |
/* Store the second 8 result bytes (dest[8..15]). */
2731 | vis_pack16(TMP2, DST_3); |
2732 | vis_st64(DST_2, dest[8]); | |
44f54ceb | 2733 | |
bb270c08 DB |
/* Advance both pointers one row; loop until all rows are done. */
2734 | ref += stride; |
2735 | dest += stride; | |
2736 | } while (--height); | |
44f54ceb MN |
2737 | } |
2738 | ||
86decad6 | 2739 | static void MC_avg_no_round_x_8_vis (uint8_t * dest, const uint8_t * ref, |
bb270c08 | 2740 | const int stride, int height) |
44f54ceb | 2741 | { |
bb270c08 DB |
2742 | unsigned long off = (unsigned long) ref & 0x7; |
2743 | unsigned long off_plus_1 = off + 1; | |
2744 | int stride_times_2 = stride << 1; | |
44f54ceb | 2745 | |
bb270c08 | 2746 | vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); |
44f54ceb | 2747 | |
bb270c08 DB |