spelling
[libav.git] / libavcodec / sparc / dsputil_vis.c
CommitLineData
/*
 * dsputil_vis.c
 * Copyright (C) 2003 David S. Miller <davem@redhat.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* The *no_round* functions have been added by James A. Morrison, 2003, 2004.
   The VIS code from libmpeg2 was adapted for FFmpeg by James A. Morrison. */
25
26#include "config.h"
27
44f54ceb
MN
28#include <inttypes.h>
29
b550bfaa 30#include "dsputil.h"
44f54ceb
MN
31
32#include "vis.h"
33
d9420d4d
DB
34extern void ff_simple_idct_put_vis(uint8_t *dest, int line_size, DCTELEM *data);
35extern void ff_simple_idct_add_vis(uint8_t *dest, int line_size, DCTELEM *data);
36extern void ff_simple_idct_vis(DCTELEM *data);
37
/* The trick used in some of this file is the formula from the MMX
 * motion comp code, which is:
 *
 * (x+y+1)>>1 == (x|y)-((x^y)>>1)
 *
 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
 * We avoid overflows by masking before we do the shift, and we
 * implement the shift by multiplying by 1/2 using mul8x16.  So in
 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
 * the value 0x80808080 is in f8):
 *
 *      fxor      f0,  f2,  f10
 *      fand      f10, f4,  f10
 *      fmul8x16  f8,  f10, f10
 *      fand      f10, f6,  f10
 *      for       f0,  f2,  f12
 *      fpsub16   f12, f10, f10
 */
57
58#define ATTR_ALIGN(alignd) __attribute__ ((aligned(alignd)))
59
60#define DUP4(x) {x, x, x, x}
61#define DUP8(x) {x, x, x, x, x, x, x, x}
62static const int16_t constants1[] ATTR_ALIGN(8) = DUP4 (1);
63static const int16_t constants2[] ATTR_ALIGN(8) = DUP4 (2);
64static const int16_t constants3[] ATTR_ALIGN(8) = DUP4 (3);
65static const int16_t constants6[] ATTR_ALIGN(8) = DUP4 (6);
66static const int8_t constants_fe[] ATTR_ALIGN(8) = DUP8 (0xfe);
67static const int8_t constants_7f[] ATTR_ALIGN(8) = DUP8 (0x7f);
68static const int8_t constants128[] ATTR_ALIGN(8) = DUP8 (128);
69static const int16_t constants256_512[] ATTR_ALIGN(8) =
bb270c08 70 {256, 512, 256, 512};
44f54ceb 71static const int16_t constants256_1024[] ATTR_ALIGN(8) =
bb270c08
DB
72 {256, 1024, 256, 1024};
73
74#define REF_0 0
75#define REF_0_1 1
76#define REF_2 2
77#define REF_2_1 3
78#define REF_4 4
79#define REF_4_1 5
80#define REF_6 6
81#define REF_6_1 7
82#define REF_S0 8
83#define REF_S0_1 9
84#define REF_S2 10
85#define REF_S2_1 11
86#define REF_S4 12
87#define REF_S4_1 13
88#define REF_S6 14
89#define REF_S6_1 15
90#define DST_0 16
91#define DST_1 17
92#define DST_2 18
93#define DST_3 19
94#define CONST_1 20
95#define CONST_2 20
96#define CONST_3 20
97#define CONST_6 20
98#define MASK_fe 20
99#define CONST_128 22
100#define CONST_256 22
101#define CONST_512 22
102#define CONST_1024 22
103#define TMP0 24
104#define TMP1 25
105#define TMP2 26
106#define TMP3 27
107#define TMP4 28
108#define TMP5 29
109#define ZERO 30
110#define MASK_7f 30
111
112#define TMP6 32
113#define TMP8 34
114#define TMP10 36
115#define TMP12 38
116#define TMP14 40
117#define TMP16 42
118#define TMP18 44
119#define TMP20 46
120#define TMP22 48
121#define TMP24 50
122#define TMP26 52
123#define TMP28 54
124#define TMP30 56
125#define TMP32 58
44f54ceb
MN
126
127static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 128 const int stride, int height)
44f54ceb 129{
bb270c08 130 uint8_t *ref = (uint8_t *) _ref;
44f54ceb 131
bb270c08
DB
132 ref = vis_alignaddr(ref);
133 do { /* 5 cycles */
134 vis_ld64(ref[0], TMP0);
44f54ceb 135
bb270c08 136 vis_ld64_2(ref, 8, TMP2);
44f54ceb 137
bb270c08
DB
138 vis_ld64_2(ref, 16, TMP4);
139 ref += stride;
44f54ceb 140
bb270c08
DB
141 vis_faligndata(TMP0, TMP2, REF_0);
142 vis_st64(REF_0, dest[0]);
44f54ceb 143
bb270c08
DB
144 vis_faligndata(TMP2, TMP4, REF_2);
145 vis_st64_2(REF_2, dest, 8);
146 dest += stride;
147 } while (--height);
44f54ceb
MN
148}
149
150static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 151 const int stride, int height)
44f54ceb 152{
bb270c08 153 uint8_t *ref = (uint8_t *) _ref;
44f54ceb 154
bb270c08
DB
155 ref = vis_alignaddr(ref);
156 do { /* 4 cycles */
157 vis_ld64(ref[0], TMP0);
44f54ceb 158
bb270c08
DB
159 vis_ld64(ref[8], TMP2);
160 ref += stride;
44f54ceb 161
bb270c08 162 /* stall */
44f54ceb 163
bb270c08
DB
164 vis_faligndata(TMP0, TMP2, REF_0);
165 vis_st64(REF_0, dest[0]);
166 dest += stride;
167 } while (--height);
44f54ceb
MN
168}
169
170
171static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 172 const int stride, int height)
44f54ceb 173{
bb270c08
DB
174 uint8_t *ref = (uint8_t *) _ref;
175 int stride_8 = stride + 8;
44f54ceb 176
bb270c08 177 ref = vis_alignaddr(ref);
44f54ceb 178
bb270c08 179 vis_ld64(ref[0], TMP0);
44f54ceb 180
bb270c08 181 vis_ld64(ref[8], TMP2);
44f54ceb 182
bb270c08 183 vis_ld64(ref[16], TMP4);
44f54ceb 184
bb270c08 185 vis_ld64(dest[0], DST_0);
44f54ceb 186
bb270c08 187 vis_ld64(dest[8], DST_2);
44f54ceb 188
bb270c08
DB
189 vis_ld64(constants_fe[0], MASK_fe);
190 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 191
bb270c08
DB
192 vis_ld64(constants_7f[0], MASK_7f);
193 vis_faligndata(TMP2, TMP4, REF_2);
44f54ceb 194
bb270c08 195 vis_ld64(constants128[0], CONST_128);
44f54ceb 196
bb270c08
DB
197 ref += stride;
198 height = (height >> 1) - 1;
44f54ceb 199
bb270c08
DB
200 do { /* 24 cycles */
201 vis_ld64(ref[0], TMP0);
202 vis_xor(DST_0, REF_0, TMP6);
44f54ceb 203
bb270c08
DB
204 vis_ld64_2(ref, 8, TMP2);
205 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 206
bb270c08
DB
207 vis_ld64_2(ref, 16, TMP4);
208 ref += stride;
209 vis_mul8x16(CONST_128, TMP6, TMP6);
210 vis_xor(DST_2, REF_2, TMP8);
44f54ceb 211
bb270c08 212 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 213
bb270c08
DB
214 vis_or(DST_0, REF_0, TMP10);
215 vis_ld64_2(dest, stride, DST_0);
216 vis_mul8x16(CONST_128, TMP8, TMP8);
44f54ceb 217
bb270c08
DB
218 vis_or(DST_2, REF_2, TMP12);
219 vis_ld64_2(dest, stride_8, DST_2);
44f54ceb 220
bb270c08
DB
221 vis_ld64(ref[0], TMP14);
222 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 223
bb270c08 224 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 225
bb270c08
DB
226 vis_psub16(TMP10, TMP6, TMP6);
227 vis_st64(TMP6, dest[0]);
44f54ceb 228
bb270c08
DB
229 vis_psub16(TMP12, TMP8, TMP8);
230 vis_st64_2(TMP8, dest, 8);
44f54ceb 231
bb270c08
DB
232 dest += stride;
233 vis_ld64_2(ref, 8, TMP16);
234 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 235
bb270c08
DB
236 vis_ld64_2(ref, 16, TMP18);
237 vis_faligndata(TMP2, TMP4, REF_2);
238 ref += stride;
44f54ceb 239
bb270c08 240 vis_xor(DST_0, REF_0, TMP20);
44f54ceb 241
bb270c08 242 vis_and(TMP20, MASK_fe, TMP20);
44f54ceb 243
bb270c08
DB
244 vis_xor(DST_2, REF_2, TMP22);
245 vis_mul8x16(CONST_128, TMP20, TMP20);
44f54ceb 246
bb270c08 247 vis_and(TMP22, MASK_fe, TMP22);
44f54ceb 248
bb270c08
DB
249 vis_or(DST_0, REF_0, TMP24);
250 vis_mul8x16(CONST_128, TMP22, TMP22);
44f54ceb 251
bb270c08 252 vis_or(DST_2, REF_2, TMP26);
44f54ceb 253
bb270c08
DB
254 vis_ld64_2(dest, stride, DST_0);
255 vis_faligndata(TMP14, TMP16, REF_0);
44f54ceb 256
bb270c08
DB
257 vis_ld64_2(dest, stride_8, DST_2);
258 vis_faligndata(TMP16, TMP18, REF_2);
44f54ceb 259
bb270c08 260 vis_and(TMP20, MASK_7f, TMP20);
44f54ceb 261
bb270c08 262 vis_and(TMP22, MASK_7f, TMP22);
44f54ceb 263
bb270c08
DB
264 vis_psub16(TMP24, TMP20, TMP20);
265 vis_st64(TMP20, dest[0]);
44f54ceb 266
bb270c08
DB
267 vis_psub16(TMP26, TMP22, TMP22);
268 vis_st64_2(TMP22, dest, 8);
269 dest += stride;
270 } while (--height);
44f54ceb 271
bb270c08
DB
272 vis_ld64(ref[0], TMP0);
273 vis_xor(DST_0, REF_0, TMP6);
44f54ceb 274
bb270c08
DB
275 vis_ld64_2(ref, 8, TMP2);
276 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 277
bb270c08
DB
278 vis_ld64_2(ref, 16, TMP4);
279 vis_mul8x16(CONST_128, TMP6, TMP6);
280 vis_xor(DST_2, REF_2, TMP8);
44f54ceb 281
bb270c08 282 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 283
bb270c08
DB
284 vis_or(DST_0, REF_0, TMP10);
285 vis_ld64_2(dest, stride, DST_0);
286 vis_mul8x16(CONST_128, TMP8, TMP8);
44f54ceb 287
bb270c08
DB
288 vis_or(DST_2, REF_2, TMP12);
289 vis_ld64_2(dest, stride_8, DST_2);
44f54ceb 290
bb270c08
DB
291 vis_ld64(ref[0], TMP14);
292 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 293
bb270c08 294 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 295
bb270c08
DB
296 vis_psub16(TMP10, TMP6, TMP6);
297 vis_st64(TMP6, dest[0]);
44f54ceb 298
bb270c08
DB
299 vis_psub16(TMP12, TMP8, TMP8);
300 vis_st64_2(TMP8, dest, 8);
44f54ceb 301
bb270c08
DB
302 dest += stride;
303 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 304
bb270c08 305 vis_faligndata(TMP2, TMP4, REF_2);
44f54ceb 306
bb270c08 307 vis_xor(DST_0, REF_0, TMP20);
44f54ceb 308
bb270c08 309 vis_and(TMP20, MASK_fe, TMP20);
44f54ceb 310
bb270c08
DB
311 vis_xor(DST_2, REF_2, TMP22);
312 vis_mul8x16(CONST_128, TMP20, TMP20);
44f54ceb 313
bb270c08 314 vis_and(TMP22, MASK_fe, TMP22);
44f54ceb 315
bb270c08
DB
316 vis_or(DST_0, REF_0, TMP24);
317 vis_mul8x16(CONST_128, TMP22, TMP22);
44f54ceb 318
bb270c08 319 vis_or(DST_2, REF_2, TMP26);
44f54ceb 320
bb270c08 321 vis_and(TMP20, MASK_7f, TMP20);
44f54ceb 322
bb270c08 323 vis_and(TMP22, MASK_7f, TMP22);
44f54ceb 324
bb270c08
DB
325 vis_psub16(TMP24, TMP20, TMP20);
326 vis_st64(TMP20, dest[0]);
44f54ceb 327
bb270c08
DB
328 vis_psub16(TMP26, TMP22, TMP22);
329 vis_st64_2(TMP22, dest, 8);
44f54ceb
MN
330}
331
332static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 333 const int stride, int height)
44f54ceb 334{
bb270c08 335 uint8_t *ref = (uint8_t *) _ref;
44f54ceb 336
bb270c08 337 ref = vis_alignaddr(ref);
44f54ceb 338
bb270c08 339 vis_ld64(ref[0], TMP0);
44f54ceb 340
bb270c08 341 vis_ld64(ref[8], TMP2);
44f54ceb 342
bb270c08 343 vis_ld64(dest[0], DST_0);
44f54ceb 344
bb270c08 345 vis_ld64(constants_fe[0], MASK_fe);
44f54ceb 346
bb270c08
DB
347 vis_ld64(constants_7f[0], MASK_7f);
348 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 349
bb270c08 350 vis_ld64(constants128[0], CONST_128);
44f54ceb 351
bb270c08
DB
352 ref += stride;
353 height = (height >> 1) - 1;
44f54ceb 354
bb270c08
DB
355 do { /* 12 cycles */
356 vis_ld64(ref[0], TMP0);
357 vis_xor(DST_0, REF_0, TMP4);
44f54ceb 358
bb270c08
DB
359 vis_ld64(ref[8], TMP2);
360 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 361
bb270c08
DB
362 vis_or(DST_0, REF_0, TMP6);
363 vis_ld64_2(dest, stride, DST_0);
364 ref += stride;
365 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 366
bb270c08
DB
367 vis_ld64(ref[0], TMP12);
368 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 369
bb270c08
DB
370 vis_ld64(ref[8], TMP2);
371 vis_xor(DST_0, REF_0, TMP0);
372 ref += stride;
44f54ceb 373
bb270c08 374 vis_and(TMP0, MASK_fe, TMP0);
44f54ceb 375
bb270c08 376 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 377
bb270c08
DB
378 vis_psub16(TMP6, TMP4, TMP4);
379 vis_st64(TMP4, dest[0]);
380 dest += stride;
381 vis_mul8x16(CONST_128, TMP0, TMP0);
44f54ceb 382
bb270c08
DB
383 vis_or(DST_0, REF_0, TMP6);
384 vis_ld64_2(dest, stride, DST_0);
44f54ceb 385
bb270c08 386 vis_faligndata(TMP12, TMP2, REF_0);
44f54ceb 387
bb270c08 388 vis_and(TMP0, MASK_7f, TMP0);
44f54ceb 389
bb270c08
DB
390 vis_psub16(TMP6, TMP0, TMP4);
391 vis_st64(TMP4, dest[0]);
392 dest += stride;
393 } while (--height);
44f54ceb 394
bb270c08
DB
395 vis_ld64(ref[0], TMP0);
396 vis_xor(DST_0, REF_0, TMP4);
44f54ceb 397
bb270c08
DB
398 vis_ld64(ref[8], TMP2);
399 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 400
bb270c08
DB
401 vis_or(DST_0, REF_0, TMP6);
402 vis_ld64_2(dest, stride, DST_0);
403 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 404
bb270c08 405 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 406
bb270c08 407 vis_xor(DST_0, REF_0, TMP0);
44f54ceb 408
bb270c08 409 vis_and(TMP0, MASK_fe, TMP0);
44f54ceb 410
bb270c08 411 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 412
bb270c08
DB
413 vis_psub16(TMP6, TMP4, TMP4);
414 vis_st64(TMP4, dest[0]);
415 dest += stride;
416 vis_mul8x16(CONST_128, TMP0, TMP0);
44f54ceb 417
bb270c08 418 vis_or(DST_0, REF_0, TMP6);
44f54ceb 419
bb270c08 420 vis_and(TMP0, MASK_7f, TMP0);
44f54ceb 421
bb270c08
DB
422 vis_psub16(TMP6, TMP0, TMP4);
423 vis_st64(TMP4, dest[0]);
44f54ceb
MN
424}
425
426static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 427 const int stride, int height)
44f54ceb 428{
bb270c08
DB
429 uint8_t *ref = (uint8_t *) _ref;
430 unsigned long off = (unsigned long) ref & 0x7;
431 unsigned long off_plus_1 = off + 1;
44f54ceb 432
bb270c08 433 ref = vis_alignaddr(ref);
44f54ceb 434
bb270c08 435 vis_ld64(ref[0], TMP0);
44f54ceb 436
bb270c08 437 vis_ld64_2(ref, 8, TMP2);
44f54ceb 438
bb270c08 439 vis_ld64_2(ref, 16, TMP4);
44f54ceb 440
bb270c08 441 vis_ld64(constants_fe[0], MASK_fe);
44f54ceb 442
bb270c08
DB
443 vis_ld64(constants_7f[0], MASK_7f);
444 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 445
bb270c08
DB
446 vis_ld64(constants128[0], CONST_128);
447 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 448
bb270c08
DB
449 if (off != 0x7) {
450 vis_alignaddr_g0((void *)off_plus_1);
451 vis_faligndata(TMP0, TMP2, REF_2);
452 vis_faligndata(TMP2, TMP4, REF_6);
453 } else {
454 vis_src1(TMP2, REF_2);
455 vis_src1(TMP4, REF_6);
456 }
44f54ceb 457
bb270c08
DB
458 ref += stride;
459 height = (height >> 1) - 1;
44f54ceb 460
bb270c08
DB
461 do { /* 34 cycles */
462 vis_ld64(ref[0], TMP0);
463 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 464
bb270c08
DB
465 vis_ld64_2(ref, 8, TMP2);
466 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 467
bb270c08
DB
468 vis_ld64_2(ref, 16, TMP4);
469 vis_and(TMP6, MASK_fe, TMP6);
470 ref += stride;
44f54ceb 471
bb270c08
DB
472 vis_ld64(ref[0], TMP14);
473 vis_mul8x16(CONST_128, TMP6, TMP6);
474 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 475
bb270c08
DB
476 vis_ld64_2(ref, 8, TMP16);
477 vis_mul8x16(CONST_128, TMP8, TMP8);
478 vis_or(REF_0, REF_2, TMP10);
44f54ceb 479
bb270c08
DB
480 vis_ld64_2(ref, 16, TMP18);
481 ref += stride;
482 vis_or(REF_4, REF_6, TMP12);
44f54ceb 483
bb270c08 484 vis_alignaddr_g0((void *)off);
44f54ceb 485
bb270c08 486 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 487
bb270c08 488 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 489
bb270c08
DB
490 if (off != 0x7) {
491 vis_alignaddr_g0((void *)off_plus_1);
492 vis_faligndata(TMP0, TMP2, REF_2);
493 vis_faligndata(TMP2, TMP4, REF_6);
494 } else {
495 vis_src1(TMP2, REF_2);
496 vis_src1(TMP4, REF_6);
497 }
44f54ceb 498
bb270c08 499 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 500
bb270c08 501 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 502
bb270c08
DB
503 vis_psub16(TMP10, TMP6, TMP6);
504 vis_st64(TMP6, dest[0]);
44f54ceb 505
bb270c08
DB
506 vis_psub16(TMP12, TMP8, TMP8);
507 vis_st64_2(TMP8, dest, 8);
508 dest += stride;
44f54ceb 509
bb270c08 510 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 511
bb270c08 512 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 513
bb270c08 514 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 515
bb270c08
DB
516 vis_mul8x16(CONST_128, TMP6, TMP6);
517 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 518
bb270c08
DB
519 vis_mul8x16(CONST_128, TMP8, TMP8);
520 vis_or(REF_0, REF_2, TMP10);
44f54ceb 521
bb270c08 522 vis_or(REF_4, REF_6, TMP12);
44f54ceb 523
bb270c08 524 vis_alignaddr_g0((void *)off);
44f54ceb 525
bb270c08 526 vis_faligndata(TMP14, TMP16, REF_0);
44f54ceb 527
bb270c08 528 vis_faligndata(TMP16, TMP18, REF_4);
44f54ceb 529
bb270c08
DB
530 if (off != 0x7) {
531 vis_alignaddr_g0((void *)off_plus_1);
532 vis_faligndata(TMP14, TMP16, REF_2);
533 vis_faligndata(TMP16, TMP18, REF_6);
534 } else {
535 vis_src1(TMP16, REF_2);
536 vis_src1(TMP18, REF_6);
537 }
44f54ceb 538
bb270c08 539 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 540
bb270c08 541 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 542
bb270c08
DB
543 vis_psub16(TMP10, TMP6, TMP6);
544 vis_st64(TMP6, dest[0]);
44f54ceb 545
bb270c08
DB
546 vis_psub16(TMP12, TMP8, TMP8);
547 vis_st64_2(TMP8, dest, 8);
548 dest += stride;
549 } while (--height);
44f54ceb 550
bb270c08
DB
551 vis_ld64(ref[0], TMP0);
552 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 553
bb270c08
DB
554 vis_ld64_2(ref, 8, TMP2);
555 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 556
bb270c08
DB
557 vis_ld64_2(ref, 16, TMP4);
558 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 559
bb270c08
DB
560 vis_mul8x16(CONST_128, TMP6, TMP6);
561 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 562
bb270c08
DB
563 vis_mul8x16(CONST_128, TMP8, TMP8);
564 vis_or(REF_0, REF_2, TMP10);
44f54ceb 565
bb270c08 566 vis_or(REF_4, REF_6, TMP12);
44f54ceb 567
bb270c08 568 vis_alignaddr_g0((void *)off);
44f54ceb 569
bb270c08 570 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 571
bb270c08 572 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 573
bb270c08
DB
574 if (off != 0x7) {
575 vis_alignaddr_g0((void *)off_plus_1);
576 vis_faligndata(TMP0, TMP2, REF_2);
577 vis_faligndata(TMP2, TMP4, REF_6);
578 } else {
579 vis_src1(TMP2, REF_2);
580 vis_src1(TMP4, REF_6);
581 }
44f54ceb 582
bb270c08 583 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 584
bb270c08 585 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 586
bb270c08
DB
587 vis_psub16(TMP10, TMP6, TMP6);
588 vis_st64(TMP6, dest[0]);
44f54ceb 589
bb270c08
DB
590 vis_psub16(TMP12, TMP8, TMP8);
591 vis_st64_2(TMP8, dest, 8);
592 dest += stride;
44f54ceb 593
bb270c08 594 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 595
bb270c08 596 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 597
bb270c08 598 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 599
bb270c08
DB
600 vis_mul8x16(CONST_128, TMP6, TMP6);
601 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 602
bb270c08
DB
603 vis_mul8x16(CONST_128, TMP8, TMP8);
604 vis_or(REF_0, REF_2, TMP10);
44f54ceb 605
bb270c08 606 vis_or(REF_4, REF_6, TMP12);
44f54ceb 607
bb270c08 608 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 609
bb270c08 610 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 611
bb270c08
DB
612 vis_psub16(TMP10, TMP6, TMP6);
613 vis_st64(TMP6, dest[0]);
44f54ceb 614
bb270c08
DB
615 vis_psub16(TMP12, TMP8, TMP8);
616 vis_st64_2(TMP8, dest, 8);
44f54ceb
MN
617}
618
619static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 620 const int stride, int height)
44f54ceb 621{
bb270c08
DB
622 uint8_t *ref = (uint8_t *) _ref;
623 unsigned long off = (unsigned long) ref & 0x7;
624 unsigned long off_plus_1 = off + 1;
44f54ceb 625
bb270c08 626 ref = vis_alignaddr(ref);
44f54ceb 627
bb270c08 628 vis_ld64(ref[0], TMP0);
44f54ceb 629
bb270c08 630 vis_ld64(ref[8], TMP2);
44f54ceb 631
bb270c08 632 vis_ld64(constants_fe[0], MASK_fe);
44f54ceb 633
bb270c08 634 vis_ld64(constants_7f[0], MASK_7f);
44f54ceb 635
bb270c08
DB
636 vis_ld64(constants128[0], CONST_128);
637 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 638
bb270c08
DB
639 if (off != 0x7) {
640 vis_alignaddr_g0((void *)off_plus_1);
641 vis_faligndata(TMP0, TMP2, REF_2);
642 } else {
643 vis_src1(TMP2, REF_2);
644 }
44f54ceb 645
bb270c08
DB
646 ref += stride;
647 height = (height >> 1) - 1;
44f54ceb 648
bb270c08
DB
649 do { /* 20 cycles */
650 vis_ld64(ref[0], TMP0);
651 vis_xor(REF_0, REF_2, TMP4);
44f54ceb 652
bb270c08
DB
653 vis_ld64_2(ref, 8, TMP2);
654 vis_and(TMP4, MASK_fe, TMP4);
655 ref += stride;
44f54ceb 656
bb270c08
DB
657 vis_ld64(ref[0], TMP8);
658 vis_or(REF_0, REF_2, TMP6);
659 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 660
bb270c08 661 vis_alignaddr_g0((void *)off);
44f54ceb 662
bb270c08
DB
663 vis_ld64_2(ref, 8, TMP10);
664 ref += stride;
665 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 666
bb270c08
DB
667 if (off != 0x7) {
668 vis_alignaddr_g0((void *)off_plus_1);
669 vis_faligndata(TMP0, TMP2, REF_2);
670 } else {
671 vis_src1(TMP2, REF_2);
672 }
44f54ceb 673
bb270c08 674 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 675
bb270c08
DB
676 vis_psub16(TMP6, TMP4, DST_0);
677 vis_st64(DST_0, dest[0]);
678 dest += stride;
44f54ceb 679
bb270c08 680 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 681
bb270c08 682 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 683
bb270c08
DB
684 vis_or(REF_0, REF_2, TMP14);
685 vis_mul8x16(CONST_128, TMP12, TMP12);
44f54ceb 686
bb270c08
DB
687 vis_alignaddr_g0((void *)off);
688 vis_faligndata(TMP8, TMP10, REF_0);
689 if (off != 0x7) {
690 vis_alignaddr_g0((void *)off_plus_1);
691 vis_faligndata(TMP8, TMP10, REF_2);
692 } else {
693 vis_src1(TMP10, REF_2);
694 }
44f54ceb 695
bb270c08 696 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 697
bb270c08
DB
698 vis_psub16(TMP14, TMP12, DST_0);
699 vis_st64(DST_0, dest[0]);
700 dest += stride;
701 } while (--height);
44f54ceb 702
bb270c08
DB
703 vis_ld64(ref[0], TMP0);
704 vis_xor(REF_0, REF_2, TMP4);
44f54ceb 705
bb270c08
DB
706 vis_ld64_2(ref, 8, TMP2);
707 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 708
bb270c08
DB
709 vis_or(REF_0, REF_2, TMP6);
710 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 711
bb270c08 712 vis_alignaddr_g0((void *)off);
44f54ceb 713
bb270c08 714 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 715
bb270c08
DB
716 if (off != 0x7) {
717 vis_alignaddr_g0((void *)off_plus_1);
718 vis_faligndata(TMP0, TMP2, REF_2);
719 } else {
720 vis_src1(TMP2, REF_2);
721 }
44f54ceb 722
bb270c08 723 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 724
bb270c08
DB
725 vis_psub16(TMP6, TMP4, DST_0);
726 vis_st64(DST_0, dest[0]);
727 dest += stride;
44f54ceb 728
bb270c08 729 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 730
bb270c08 731 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 732
bb270c08
DB
733 vis_or(REF_0, REF_2, TMP14);
734 vis_mul8x16(CONST_128, TMP12, TMP12);
44f54ceb 735
bb270c08 736 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 737
bb270c08
DB
738 vis_psub16(TMP14, TMP12, DST_0);
739 vis_st64(DST_0, dest[0]);
740 dest += stride;
44f54ceb
MN
741}
742
743static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 744 const int stride, int height)
44f54ceb 745{
bb270c08
DB
746 uint8_t *ref = (uint8_t *) _ref;
747 unsigned long off = (unsigned long) ref & 0x7;
748 unsigned long off_plus_1 = off + 1;
44f54ceb 749
bb270c08 750 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 751
bb270c08
DB
752 vis_ld64(constants3[0], CONST_3);
753 vis_fzero(ZERO);
754 vis_ld64(constants256_512[0], CONST_256);
44f54ceb 755
bb270c08
DB
756 ref = vis_alignaddr(ref);
757 do { /* 26 cycles */
758 vis_ld64(ref[0], TMP0);
44f54ceb 759
bb270c08 760 vis_ld64(ref[8], TMP2);
44f54ceb 761
bb270c08 762 vis_alignaddr_g0((void *)off);
44f54ceb 763
bb270c08 764 vis_ld64(ref[16], TMP4);
44f54ceb 765
bb270c08
DB
766 vis_ld64(dest[0], DST_0);
767 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 768
bb270c08
DB
769 vis_ld64(dest[8], DST_2);
770 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 771
bb270c08
DB
772 if (off != 0x7) {
773 vis_alignaddr_g0((void *)off_plus_1);
774 vis_faligndata(TMP0, TMP2, REF_2);
775 vis_faligndata(TMP2, TMP4, REF_6);
776 } else {
777 vis_src1(TMP2, REF_2);
778 vis_src1(TMP4, REF_6);
779 }
44f54ceb 780
bb270c08 781 vis_mul8x16au(REF_0, CONST_256, TMP0);
44f54ceb 782
bb270c08
DB
783 vis_pmerge(ZERO, REF_2, TMP4);
784 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
44f54ceb 785
bb270c08 786 vis_pmerge(ZERO, REF_2_1, TMP6);
44f54ceb 787
bb270c08 788 vis_padd16(TMP0, TMP4, TMP0);
44f54ceb 789
bb270c08
DB
790 vis_mul8x16al(DST_0, CONST_512, TMP4);
791 vis_padd16(TMP2, TMP6, TMP2);
44f54ceb 792
bb270c08 793 vis_mul8x16al(DST_1, CONST_512, TMP6);
44f54ceb 794
bb270c08 795 vis_mul8x16au(REF_6, CONST_256, TMP12);
44f54ceb 796
bb270c08
DB
797 vis_padd16(TMP0, TMP4, TMP0);
798 vis_mul8x16au(REF_6_1, CONST_256, TMP14);
44f54ceb 799
bb270c08
DB
800 vis_padd16(TMP2, TMP6, TMP2);
801 vis_mul8x16au(REF_4, CONST_256, TMP16);
44f54ceb 802
bb270c08
DB
803 vis_padd16(TMP0, CONST_3, TMP8);
804 vis_mul8x16au(REF_4_1, CONST_256, TMP18);
44f54ceb 805
bb270c08
DB
806 vis_padd16(TMP2, CONST_3, TMP10);
807 vis_pack16(TMP8, DST_0);
44f54ceb 808
bb270c08
DB
809 vis_pack16(TMP10, DST_1);
810 vis_padd16(TMP16, TMP12, TMP0);
44f54ceb 811
bb270c08
DB
812 vis_st64(DST_0, dest[0]);
813 vis_mul8x16al(DST_2, CONST_512, TMP4);
814 vis_padd16(TMP18, TMP14, TMP2);
44f54ceb 815
bb270c08
DB
816 vis_mul8x16al(DST_3, CONST_512, TMP6);
817 vis_padd16(TMP0, CONST_3, TMP0);
44f54ceb 818
bb270c08 819 vis_padd16(TMP2, CONST_3, TMP2);
44f54ceb 820
bb270c08 821 vis_padd16(TMP0, TMP4, TMP0);
44f54ceb 822
bb270c08
DB
823 vis_padd16(TMP2, TMP6, TMP2);
824 vis_pack16(TMP0, DST_2);
44f54ceb 825
bb270c08
DB
826 vis_pack16(TMP2, DST_3);
827 vis_st64(DST_2, dest[8]);
44f54ceb 828
bb270c08
DB
829 ref += stride;
830 dest += stride;
831 } while (--height);
44f54ceb
MN
832}
833
834static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 835 const int stride, int height)
44f54ceb 836{
bb270c08
DB
837 uint8_t *ref = (uint8_t *) _ref;
838 unsigned long off = (unsigned long) ref & 0x7;
839 unsigned long off_plus_1 = off + 1;
840 int stride_times_2 = stride << 1;
44f54ceb 841
bb270c08 842 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 843
bb270c08
DB
844 vis_ld64(constants3[0], CONST_3);
845 vis_fzero(ZERO);
846 vis_ld64(constants256_512[0], CONST_256);
44f54ceb 847
bb270c08
DB
848 ref = vis_alignaddr(ref);
849 height >>= 2;
850 do { /* 47 cycles */
851 vis_ld64(ref[0], TMP0);
44f54ceb 852
bb270c08
DB
853 vis_ld64_2(ref, 8, TMP2);
854 ref += stride;
44f54ceb 855
bb270c08 856 vis_alignaddr_g0((void *)off);
44f54ceb 857
bb270c08
DB
858 vis_ld64(ref[0], TMP4);
859 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 860
bb270c08
DB
861 vis_ld64_2(ref, 8, TMP6);
862 ref += stride;
44f54ceb 863
bb270c08 864 vis_ld64(ref[0], TMP8);
44f54ceb 865
bb270c08
DB
866 vis_ld64_2(ref, 8, TMP10);
867 ref += stride;
868 vis_faligndata(TMP4, TMP6, REF_4);
44f54ceb 869
bb270c08 870 vis_ld64(ref[0], TMP12);
44f54ceb 871
bb270c08
DB
872 vis_ld64_2(ref, 8, TMP14);
873 ref += stride;
874 vis_faligndata(TMP8, TMP10, REF_S0);
44f54ceb 875
bb270c08 876 vis_faligndata(TMP12, TMP14, REF_S4);
44f54ceb 877
bb270c08
DB
878 if (off != 0x7) {
879 vis_alignaddr_g0((void *)off_plus_1);
44f54ceb 880
bb270c08
DB
881 vis_ld64(dest[0], DST_0);
882 vis_faligndata(TMP0, TMP2, REF_2);
44f54ceb 883
bb270c08
DB
884 vis_ld64_2(dest, stride, DST_2);
885 vis_faligndata(TMP4, TMP6, REF_6);
44f54ceb 886
bb270c08 887 vis_faligndata(TMP8, TMP10, REF_S2);
44f54ceb 888
bb270c08
DB
889 vis_faligndata(TMP12, TMP14, REF_S6);
890 } else {
891 vis_ld64(dest[0], DST_0);
892 vis_src1(TMP2, REF_2);
44f54ceb 893
bb270c08
DB
894 vis_ld64_2(dest, stride, DST_2);
895 vis_src1(TMP6, REF_6);
44f54ceb 896
bb270c08 897 vis_src1(TMP10, REF_S2);
44f54ceb 898
bb270c08
DB
899 vis_src1(TMP14, REF_S6);
900 }
44f54ceb 901
bb270c08
DB
902 vis_pmerge(ZERO, REF_0, TMP0);
903 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
44f54ceb 904
bb270c08
DB
905 vis_pmerge(ZERO, REF_2, TMP4);
906 vis_mul8x16au(REF_2_1, CONST_256, TMP6);
44f54ceb 907
bb270c08
DB
908 vis_padd16(TMP0, CONST_3, TMP0);
909 vis_mul8x16al(DST_0, CONST_512, TMP16);
44f54ceb 910
bb270c08
DB
911 vis_padd16(TMP2, CONST_3, TMP2);
912 vis_mul8x16al(DST_1, CONST_512, TMP18);
44f54ceb 913
bb270c08
DB
914 vis_padd16(TMP0, TMP4, TMP0);
915 vis_mul8x16au(REF_4, CONST_256, TMP8);
44f54ceb 916
bb270c08
DB
917 vis_padd16(TMP2, TMP6, TMP2);
918 vis_mul8x16au(REF_4_1, CONST_256, TMP10);
44f54ceb 919
bb270c08
DB
920 vis_padd16(TMP0, TMP16, TMP0);
921 vis_mul8x16au(REF_6, CONST_256, TMP12);
44f54ceb 922
bb270c08
DB
923 vis_padd16(TMP2, TMP18, TMP2);
924 vis_mul8x16au(REF_6_1, CONST_256, TMP14);
44f54ceb 925
bb270c08
DB
926 vis_padd16(TMP8, CONST_3, TMP8);
927 vis_mul8x16al(DST_2, CONST_512, TMP16);
44f54ceb 928
bb270c08
DB
929 vis_padd16(TMP8, TMP12, TMP8);
930 vis_mul8x16al(DST_3, CONST_512, TMP18);
44f54ceb 931
bb270c08
DB
932 vis_padd16(TMP10, TMP14, TMP10);
933 vis_pack16(TMP0, DST_0);
44f54ceb 934
bb270c08
DB
935 vis_pack16(TMP2, DST_1);
936 vis_st64(DST_0, dest[0]);
937 dest += stride;
938 vis_padd16(TMP10, CONST_3, TMP10);
44f54ceb 939
bb270c08
DB
940 vis_ld64_2(dest, stride, DST_0);
941 vis_padd16(TMP8, TMP16, TMP8);
44f54ceb 942
bb270c08
DB
943 vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);
944 vis_padd16(TMP10, TMP18, TMP10);
945 vis_pack16(TMP8, DST_2);
44f54ceb 946
bb270c08
DB
947 vis_pack16(TMP10, DST_3);
948 vis_st64(DST_2, dest[0]);
949 dest += stride;
44f54ceb 950
bb270c08
DB
951 vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
952 vis_pmerge(ZERO, REF_S0, TMP0);
44f54ceb 953
bb270c08
DB
954 vis_pmerge(ZERO, REF_S2, TMP24);
955 vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
44f54ceb 956
bb270c08
DB
957 vis_padd16(TMP0, CONST_3, TMP0);
958 vis_mul8x16au(REF_S4, CONST_256, TMP8);
44f54ceb 959
bb270c08
DB
960 vis_padd16(TMP2, CONST_3, TMP2);
961 vis_mul8x16au(REF_S4_1, CONST_256, TMP10);
44f54ceb 962
bb270c08
DB
963 vis_padd16(TMP0, TMP24, TMP0);
964 vis_mul8x16au(REF_S6, CONST_256, TMP12);
44f54ceb 965
bb270c08
DB
966 vis_padd16(TMP2, TMP6, TMP2);
967 vis_mul8x16au(REF_S6_1, CONST_256, TMP14);
44f54ceb 968
bb270c08
DB
969 vis_padd16(TMP8, CONST_3, TMP8);
970 vis_mul8x16al(DST_0, CONST_512, TMP16);
44f54ceb 971
bb270c08
DB
972 vis_padd16(TMP10, CONST_3, TMP10);
973 vis_mul8x16al(DST_1, CONST_512, TMP18);
44f54ceb 974
bb270c08
DB
975 vis_padd16(TMP8, TMP12, TMP8);
976 vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);
44f54ceb 977
bb270c08
DB
978 vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
979 vis_padd16(TMP0, TMP16, TMP0);
44f54ceb 980
bb270c08
DB
981 vis_padd16(TMP2, TMP18, TMP2);
982 vis_pack16(TMP0, DST_0);
44f54ceb 983
bb270c08
DB
984 vis_padd16(TMP10, TMP14, TMP10);
985 vis_pack16(TMP2, DST_1);
986 vis_st64(DST_0, dest[0]);
987 dest += stride;
44f54ceb 988
bb270c08 989 vis_padd16(TMP8, TMP20, TMP8);
44f54ceb 990
bb270c08
DB
991 vis_padd16(TMP10, TMP22, TMP10);
992 vis_pack16(TMP8, DST_2);
44f54ceb 993
bb270c08
DB
994 vis_pack16(TMP10, DST_3);
995 vis_st64(DST_2, dest[0]);
996 dest += stride;
997 } while (--height);
44f54ceb
MN
998}
999
/* Put with vertical (y) half-pel interpolation, 16 pixels wide.
 *
 * Each output row is the rounded average of two vertically adjacent
 * source rows, using the identity from the top of this file:
 *     (x+y+1)>>1 == (x|y) - (((x^y)&0xfe)>>1)
 * The &0xfe mask (MASK_fe), the mul8x16 by CONST_128 (x1/2) and the
 * &0x7f mask (MASK_7f) implement the masked shift; vis_psub16 then
 * subtracts it from (x|y).
 *
 * The instruction stream is hand-scheduled (see the cycle-count
 * comment on the loop); do not reorder statements.  Two output rows
 * are produced per iteration, and the final pair is peeled out of the
 * loop so the loop body can prefetch two rows ahead.
 */
static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;

        /* Prologue: align ref and prime REF_0/REF_4 (row 0) and
         * REF_2/REF_6 (row 1), plus the constant masks. */
        ref = vis_alignaddr(ref);
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;

        vis_ld64(ref[0], TMP6);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, 8, TMP8);
        vis_faligndata(TMP2, TMP4, REF_4);

        vis_ld64_2(ref, 16, TMP10);
        ref += stride;

        vis_ld64(constants_fe[0], MASK_fe);
        vis_faligndata(TMP6, TMP8, REF_2);

        vis_ld64(constants_7f[0], MASK_7f);
        vis_faligndata(TMP8, TMP10, REF_6);

        vis_ld64(constants128[0], CONST_128);
        height = (height >> 1) - 1;     /* two rows per iteration, last pair peeled */
        do {    /* 24 cycles */
                vis_ld64(ref[0], TMP0);
                vis_xor(REF_0, REF_2, TMP12);

                vis_ld64_2(ref, 8, TMP2);
                vis_xor(REF_4, REF_6, TMP16);

                vis_ld64_2(ref, 16, TMP4);
                ref += stride;
                vis_or(REF_0, REF_2, TMP14);

                vis_ld64(ref[0], TMP6);
                vis_or(REF_4, REF_6, TMP18);

                vis_ld64_2(ref, 8, TMP8);
                vis_faligndata(TMP0, TMP2, REF_0);

                vis_ld64_2(ref, 16, TMP10);
                ref += stride;
                vis_faligndata(TMP2, TMP4, REF_4);

                vis_and(TMP12, MASK_fe, TMP12);

                vis_and(TMP16, MASK_fe, TMP16);
                vis_mul8x16(CONST_128, TMP12, TMP12);

                vis_mul8x16(CONST_128, TMP16, TMP16);
                vis_xor(REF_0, REF_2, TMP0);

                vis_xor(REF_4, REF_6, TMP2);

                vis_or(REF_0, REF_2, TMP20);

                vis_and(TMP12, MASK_7f, TMP12);

                vis_and(TMP16, MASK_7f, TMP16);

                vis_psub16(TMP14, TMP12, TMP12);
                vis_st64(TMP12, dest[0]);

                vis_psub16(TMP18, TMP16, TMP16);
                vis_st64_2(TMP16, dest, 8);
                dest += stride;

                vis_or(REF_4, REF_6, TMP18);

                vis_and(TMP0, MASK_fe, TMP0);

                vis_and(TMP2, MASK_fe, TMP2);
                vis_mul8x16(CONST_128, TMP0, TMP0);

                vis_faligndata(TMP6, TMP8, REF_2);
                vis_mul8x16(CONST_128, TMP2, TMP2);

                vis_faligndata(TMP8, TMP10, REF_6);

                vis_and(TMP0, MASK_7f, TMP0);

                vis_and(TMP2, MASK_7f, TMP2);

                vis_psub16(TMP20, TMP0, TMP0);
                vis_st64(TMP0, dest[0]);

                vis_psub16(TMP18, TMP2, TMP2);
                vis_st64_2(TMP2, dest, 8);
                dest += stride;
        } while (--height);

        /* Epilogue: peeled final iteration — produces the last two
         * output rows without loading past the reference block. */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP12);

        vis_ld64_2(ref, 8, TMP2);
        vis_xor(REF_4, REF_6, TMP16);

        vis_ld64_2(ref, 16, TMP4);
        vis_or(REF_0, REF_2, TMP14);

        vis_or(REF_4, REF_6, TMP18);

        vis_faligndata(TMP0, TMP2, REF_0);

        vis_faligndata(TMP2, TMP4, REF_4);

        vis_and(TMP12, MASK_fe, TMP12);

        vis_and(TMP16, MASK_fe, TMP16);
        vis_mul8x16(CONST_128, TMP12, TMP12);

        vis_mul8x16(CONST_128, TMP16, TMP16);
        vis_xor(REF_0, REF_2, TMP0);

        vis_xor(REF_4, REF_6, TMP2);

        vis_or(REF_0, REF_2, TMP20);

        vis_and(TMP12, MASK_7f, TMP12);

        vis_and(TMP16, MASK_7f, TMP16);

        vis_psub16(TMP14, TMP12, TMP12);
        vis_st64(TMP12, dest[0]);

        vis_psub16(TMP18, TMP16, TMP16);
        vis_st64_2(TMP16, dest, 8);
        dest += stride;

        vis_or(REF_4, REF_6, TMP18);

        vis_and(TMP0, MASK_fe, TMP0);

        vis_and(TMP2, MASK_fe, TMP2);
        vis_mul8x16(CONST_128, TMP0, TMP0);

        vis_mul8x16(CONST_128, TMP2, TMP2);

        vis_and(TMP0, MASK_7f, TMP0);

        vis_and(TMP2, MASK_7f, TMP2);

        vis_psub16(TMP20, TMP0, TMP0);
        vis_st64(TMP0, dest[0]);

        vis_psub16(TMP18, TMP2, TMP2);
        vis_st64_2(TMP2, dest, 8);
}
1155
/* Put with vertical (y) half-pel interpolation, 8 pixels wide.
 *
 * Same rounded-average scheme as MC_put_y_16_vis:
 *     (x+y+1)>>1 == (x|y) - (((x^y)&0xfe)>>1)
 * but one 8-byte store per row.  Hand-scheduled; two rows per loop
 * iteration with the last pair peeled into an epilogue.
 */
static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;

        /* Prologue: prime REF_0 (row 0) and REF_2 (row 1) plus masks. */
        ref = vis_alignaddr(ref);
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);
        ref += stride;

        vis_ld64(ref[0], TMP4);

        vis_ld64_2(ref, 8, TMP6);
        ref += stride;

        vis_ld64(constants_fe[0], MASK_fe);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(constants_7f[0], MASK_7f);
        vis_faligndata(TMP4, TMP6, REF_2);

        vis_ld64(constants128[0], CONST_128);
        height = (height >> 1) - 1;     /* two rows per iteration, last pair peeled */
        do {    /* 12 cycles */
                vis_ld64(ref[0], TMP0);
                vis_xor(REF_0, REF_2, TMP4);

                vis_ld64_2(ref, 8, TMP2);
                ref += stride;
                vis_and(TMP4, MASK_fe, TMP4);

                vis_or(REF_0, REF_2, TMP6);
                vis_mul8x16(CONST_128, TMP4, TMP4);

                vis_faligndata(TMP0, TMP2, REF_0);
                vis_ld64(ref[0], TMP0);

                vis_ld64_2(ref, 8, TMP2);
                ref += stride;
                vis_xor(REF_0, REF_2, TMP12);

                vis_and(TMP4, MASK_7f, TMP4);

                vis_and(TMP12, MASK_fe, TMP12);

                vis_mul8x16(CONST_128, TMP12, TMP12);
                vis_or(REF_0, REF_2, TMP14);

                vis_psub16(TMP6, TMP4, DST_0);
                vis_st64(DST_0, dest[0]);
                dest += stride;

                vis_faligndata(TMP0, TMP2, REF_2);

                vis_and(TMP12, MASK_7f, TMP12);

                vis_psub16(TMP14, TMP12, DST_0);
                vis_st64(DST_0, dest[0]);
                dest += stride;
        } while (--height);

        /* Epilogue: peeled final pair of rows. */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP4);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP4, MASK_fe, TMP4);

        vis_or(REF_0, REF_2, TMP6);
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_faligndata(TMP0, TMP2, REF_0);

        vis_xor(REF_0, REF_2, TMP12);

        vis_and(TMP4, MASK_7f, TMP4);

        vis_and(TMP12, MASK_fe, TMP12);

        vis_mul8x16(CONST_128, TMP12, TMP12);
        vis_or(REF_0, REF_2, TMP14);

        vis_psub16(TMP6, TMP4, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;

        vis_and(TMP12, MASK_7f, TMP12);

        vis_psub16(TMP14, TMP12, DST_0);
        vis_st64(DST_0, dest[0]);
}
1247
/* Average with vertical (y) half-pel interpolation, 16 pixels wide.
 *
 * Each output row combines two vertically adjacent reference rows with
 * the bytes already in dest (read via DST_* / REF_S*).  The sums are
 * widened to 16 bits with mul8x16au/pmerge, biased by CONST_3 for
 * rounding, and packed back to bytes with vis_pack16 under the GSR
 * scale factor set to 5 at entry.
 *
 * Hand-scheduled (31-cycle loop comment); two output rows per
 * iteration, height halved accordingly.  Do not reorder statements.
 */
static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        int stride_8 = stride + 8;
        int stride_16 = stride + 16;

        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

        ref = vis_alignaddr(ref);

        /* Prologue: prime REF_2/REF_6 with the first reference row. */
        vis_ld64(ref[ 0], TMP0);
        vis_fzero(ZERO);

        vis_ld64(ref[ 8], TMP2);

        vis_ld64(ref[16], TMP4);

        vis_ld64(constants3[0], CONST_3);
        vis_faligndata(TMP0, TMP2, REF_2);

        vis_ld64(constants256_512[0], CONST_256);
        vis_faligndata(TMP2, TMP4, REF_6);
        height >>= 1;

        do {    /* 31 cycles */
                vis_ld64_2(ref, stride, TMP0);
                vis_pmerge(ZERO, REF_2, TMP12);
                vis_mul8x16au(REF_2_1, CONST_256, TMP14);

                vis_ld64_2(ref, stride_8, TMP2);
                vis_pmerge(ZERO, REF_6, TMP16);
                vis_mul8x16au(REF_6_1, CONST_256, TMP18);

                vis_ld64_2(ref, stride_16, TMP4);
                ref += stride;

                vis_ld64(dest[0], DST_0);
                vis_faligndata(TMP0, TMP2, REF_0);

                vis_ld64_2(dest, 8, DST_2);
                vis_faligndata(TMP2, TMP4, REF_4);

                vis_ld64_2(ref, stride, TMP6);
                vis_pmerge(ZERO, REF_0, TMP0);
                vis_mul8x16au(REF_0_1, CONST_256, TMP2);

                vis_ld64_2(ref, stride_8, TMP8);
                vis_pmerge(ZERO, REF_4, TMP4);

                vis_ld64_2(ref, stride_16, TMP10);
                ref += stride;

                /* REF_S0/REF_S2 double as DST_4/DST_6: the second
                 * destination row is loaded into them. */
                vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
                vis_faligndata(TMP6, TMP8, REF_2);
                vis_mul8x16au(REF_4_1, CONST_256, TMP6);

                vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
                vis_faligndata(TMP8, TMP10, REF_6);
                vis_mul8x16al(DST_0, CONST_512, TMP20);

                vis_padd16(TMP0, CONST_3, TMP0);
                vis_mul8x16al(DST_1, CONST_512, TMP22);

                vis_padd16(TMP2, CONST_3, TMP2);
                vis_mul8x16al(DST_2, CONST_512, TMP24);

                vis_padd16(TMP4, CONST_3, TMP4);
                vis_mul8x16al(DST_3, CONST_512, TMP26);

                vis_padd16(TMP6, CONST_3, TMP6);

                vis_padd16(TMP12, TMP20, TMP12);
                vis_mul8x16al(REF_S0, CONST_512, TMP20);

                vis_padd16(TMP14, TMP22, TMP14);
                vis_mul8x16al(REF_S0_1, CONST_512, TMP22);

                vis_padd16(TMP16, TMP24, TMP16);
                vis_mul8x16al(REF_S2, CONST_512, TMP24);

                vis_padd16(TMP18, TMP26, TMP18);
                vis_mul8x16al(REF_S2_1, CONST_512, TMP26);

                vis_padd16(TMP12, TMP0, TMP12);
                vis_mul8x16au(REF_2, CONST_256, TMP28);

                vis_padd16(TMP14, TMP2, TMP14);
                vis_mul8x16au(REF_2_1, CONST_256, TMP30);

                vis_padd16(TMP16, TMP4, TMP16);
                vis_mul8x16au(REF_6, CONST_256, REF_S4);

                vis_padd16(TMP18, TMP6, TMP18);
                vis_mul8x16au(REF_6_1, CONST_256, REF_S6);

                /* Pack and store the first output row while the
                 * second row's sums are assembled. */
                vis_pack16(TMP12, DST_0);
                vis_padd16(TMP28, TMP0, TMP12);

                vis_pack16(TMP14, DST_1);
                vis_st64(DST_0, dest[0]);
                vis_padd16(TMP30, TMP2, TMP14);

                vis_pack16(TMP16, DST_2);
                vis_padd16(REF_S4, TMP4, TMP16);

                vis_pack16(TMP18, DST_3);
                vis_st64_2(DST_2, dest, 8);
                dest += stride;
                vis_padd16(REF_S6, TMP6, TMP18);

                vis_padd16(TMP12, TMP20, TMP12);

                vis_padd16(TMP14, TMP22, TMP14);
                vis_pack16(TMP12, DST_0);

                vis_padd16(TMP16, TMP24, TMP16);
                vis_pack16(TMP14, DST_1);
                vis_st64(DST_0, dest[0]);

                vis_padd16(TMP18, TMP26, TMP18);
                vis_pack16(TMP16, DST_2);

                vis_pack16(TMP18, DST_3);
                vis_st64_2(DST_2, dest, 8);
                dest += stride;
        } while (--height);
}
1376
/* Average with vertical (y) half-pel interpolation, 8 pixels wide.
 *
 * 8-pixel counterpart of MC_avg_y_16_vis: two adjacent reference rows
 * plus the existing dest bytes, widened to 16 bits, CONST_3 rounding
 * bias, packed with GSR scale factor 5.  Hand-scheduled (20-cycle
 * loop); two output rows per iteration.
 */
static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        int stride_8 = stride + 8;

        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

        ref = vis_alignaddr(ref);

        /* Prologue: prime REF_2 with the first reference row. */
        vis_ld64(ref[ 0], TMP0);
        vis_fzero(ZERO);

        vis_ld64(ref[ 8], TMP2);

        vis_ld64(constants3[0], CONST_3);
        vis_faligndata(TMP0, TMP2, REF_2);

        vis_ld64(constants256_512[0], CONST_256);

        height >>= 1;
        do {    /* 20 cycles */
                vis_ld64_2(ref, stride, TMP0);
                vis_pmerge(ZERO, REF_2, TMP8);
                vis_mul8x16au(REF_2_1, CONST_256, TMP10);

                vis_ld64_2(ref, stride_8, TMP2);
                ref += stride;

                vis_ld64(dest[0], DST_0);

                vis_ld64_2(dest, stride, DST_2);
                vis_faligndata(TMP0, TMP2, REF_0);

                vis_ld64_2(ref, stride, TMP4);
                vis_mul8x16al(DST_0, CONST_512, TMP16);
                vis_pmerge(ZERO, REF_0, TMP12);

                vis_ld64_2(ref, stride_8, TMP6);
                ref += stride;
                vis_mul8x16al(DST_1, CONST_512, TMP18);
                vis_pmerge(ZERO, REF_0_1, TMP14);

                vis_padd16(TMP12, CONST_3, TMP12);
                vis_mul8x16al(DST_2, CONST_512, TMP24);

                vis_padd16(TMP14, CONST_3, TMP14);
                vis_mul8x16al(DST_3, CONST_512, TMP26);

                vis_faligndata(TMP4, TMP6, REF_2);

                vis_padd16(TMP8, TMP12, TMP8);

                vis_padd16(TMP10, TMP14, TMP10);
                vis_mul8x16au(REF_2, CONST_256, TMP20);

                vis_padd16(TMP8, TMP16, TMP0);
                vis_mul8x16au(REF_2_1, CONST_256, TMP22);

                vis_padd16(TMP10, TMP18, TMP2);
                vis_pack16(TMP0, DST_0);

                vis_pack16(TMP2, DST_1);
                vis_st64(DST_0, dest[0]);
                dest += stride;
                vis_padd16(TMP12, TMP20, TMP12);

                vis_padd16(TMP14, TMP22, TMP14);

                vis_padd16(TMP12, TMP24, TMP0);

                vis_padd16(TMP14, TMP26, TMP2);
                vis_pack16(TMP0, DST_2);

                vis_pack16(TMP2, DST_3);
                vis_st64(DST_2, dest[0]);
                dest += stride;
        } while (--height);
}
1456
/* Put with horizontal+vertical (xy) half-pel interpolation, 16 wide.
 *
 * Averages four neighbouring pixels (two rows x two columns).  The
 * horizontal neighbour is obtained with a second alignment: `off` is
 * the low 3 bits of ref, and faligndata under alignaddr_g0(off+1)
 * yields the data shifted by one byte.  off == 7 needs vis_src1
 * instead, since off+1 == 8 would wrap the alignment.  Sums are
 * widened to 16 bits, biased by CONST_2 for rounding, and packed with
 * GSR scale factor 5.  Hand-scheduled; two output rows per iteration.
 */
static void MC_put_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
                              const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        unsigned long off = (unsigned long) ref & 0x7;
        unsigned long off_plus_1 = off + 1;
        int stride_8 = stride + 8;
        int stride_16 = stride + 16;

        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

        ref = vis_alignaddr(ref);

        /* Prologue: prime REF_S0/REF_S4 (row 0) and their one-byte-
         * shifted partners REF_S2/REF_S6. */
        vis_ld64(ref[ 0], TMP0);
        vis_fzero(ZERO);

        vis_ld64(ref[ 8], TMP2);

        vis_ld64(ref[16], TMP4);

        vis_ld64(constants2[0], CONST_2);
        vis_faligndata(TMP0, TMP2, REF_S0);

        vis_ld64(constants256_512[0], CONST_256);
        vis_faligndata(TMP2, TMP4, REF_S4);

        if (off != 0x7) {
                vis_alignaddr_g0((void *)off_plus_1);
                vis_faligndata(TMP0, TMP2, REF_S2);
                vis_faligndata(TMP2, TMP4, REF_S6);
        } else {
                vis_src1(TMP2, REF_S2);
                vis_src1(TMP4, REF_S6);
        }

        height >>= 1;
        do {
                vis_ld64_2(ref, stride, TMP0);
                vis_mul8x16au(REF_S0, CONST_256, TMP12);
                vis_pmerge(ZERO, REF_S0_1, TMP14);

                vis_alignaddr_g0((void *)off);

                vis_ld64_2(ref, stride_8, TMP2);
                vis_mul8x16au(REF_S2, CONST_256, TMP16);
                vis_pmerge(ZERO, REF_S2_1, TMP18);

                vis_ld64_2(ref, stride_16, TMP4);
                ref += stride;
                vis_mul8x16au(REF_S4, CONST_256, TMP20);
                vis_pmerge(ZERO, REF_S4_1, TMP22);

                vis_ld64_2(ref, stride, TMP6);
                vis_mul8x16au(REF_S6, CONST_256, TMP24);
                vis_pmerge(ZERO, REF_S6_1, TMP26);

                vis_ld64_2(ref, stride_8, TMP8);
                vis_faligndata(TMP0, TMP2, REF_0);

                vis_ld64_2(ref, stride_16, TMP10);
                ref += stride;
                vis_faligndata(TMP2, TMP4, REF_4);

                vis_faligndata(TMP6, TMP8, REF_S0);

                vis_faligndata(TMP8, TMP10, REF_S4);

                /* Shifted-by-one copies of both newly loaded rows. */
                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);
                        vis_faligndata(TMP0, TMP2, REF_2);
                        vis_faligndata(TMP2, TMP4, REF_6);
                        vis_faligndata(TMP6, TMP8, REF_S2);
                        vis_faligndata(TMP8, TMP10, REF_S6);
                } else {
                        vis_src1(TMP2, REF_2);
                        vis_src1(TMP4, REF_6);
                        vis_src1(TMP8, REF_S2);
                        vis_src1(TMP10, REF_S6);
                }

                vis_mul8x16au(REF_0, CONST_256, TMP0);
                vis_pmerge(ZERO, REF_0_1, TMP2);

                vis_mul8x16au(REF_2, CONST_256, TMP4);
                vis_pmerge(ZERO, REF_2_1, TMP6);

                vis_padd16(TMP0, CONST_2, TMP8);
                vis_mul8x16au(REF_4, CONST_256, TMP0);

                vis_padd16(TMP2, CONST_2, TMP10);
                vis_mul8x16au(REF_4_1, CONST_256, TMP2);

                vis_padd16(TMP8, TMP4, TMP8);
                vis_mul8x16au(REF_6, CONST_256, TMP4);

                vis_padd16(TMP10, TMP6, TMP10);
                vis_mul8x16au(REF_6_1, CONST_256, TMP6);

                vis_padd16(TMP12, TMP8, TMP12);

                vis_padd16(TMP14, TMP10, TMP14);

                vis_padd16(TMP12, TMP16, TMP12);

                vis_padd16(TMP14, TMP18, TMP14);
                vis_pack16(TMP12, DST_0);

                vis_pack16(TMP14, DST_1);
                vis_st64(DST_0, dest[0]);
                vis_padd16(TMP0, CONST_2, TMP12);

                vis_mul8x16au(REF_S0, CONST_256, TMP0);
                vis_padd16(TMP2, CONST_2, TMP14);

                vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
                vis_padd16(TMP12, TMP4, TMP12);

                vis_mul8x16au(REF_S2, CONST_256, TMP4);
                vis_padd16(TMP14, TMP6, TMP14);

                vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
                vis_padd16(TMP20, TMP12, TMP20);

                vis_padd16(TMP22, TMP14, TMP22);

                vis_padd16(TMP20, TMP24, TMP20);

                vis_padd16(TMP22, TMP26, TMP22);
                vis_pack16(TMP20, DST_2);

                vis_pack16(TMP22, DST_3);
                vis_st64_2(DST_2, dest, 8);
                dest += stride;
                vis_padd16(TMP0, TMP4, TMP24);

                vis_mul8x16au(REF_S4, CONST_256, TMP0);
                vis_padd16(TMP2, TMP6, TMP26);

                vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
                vis_padd16(TMP24, TMP8, TMP24);

                vis_padd16(TMP26, TMP10, TMP26);
                vis_pack16(TMP24, DST_0);

                vis_pack16(TMP26, DST_1);
                vis_st64(DST_0, dest[0]);
                vis_pmerge(ZERO, REF_S6, TMP4);

                vis_pmerge(ZERO, REF_S6_1, TMP6);

                vis_padd16(TMP0, TMP4, TMP0);

                vis_padd16(TMP2, TMP6, TMP2);

                vis_padd16(TMP0, TMP12, TMP0);

                vis_padd16(TMP2, TMP14, TMP2);
                vis_pack16(TMP0, DST_2);

                vis_pack16(TMP2, DST_3);
                vis_st64_2(DST_2, dest, 8);
                dest += stride;
        } while (--height);
}
1621
/* Put with horizontal+vertical (xy) half-pel interpolation, 8 wide.
 *
 * 8-pixel counterpart of MC_put_xy_16_vis: four-neighbour average via
 * the dual-alignment trick (alignaddr_g0 at off and off+1, vis_src1
 * for the off == 7 wrap case), CONST_2 rounding bias, packed with
 * GSR scale factor 5.  Hand-scheduled (26-cycle loop); two output
 * rows per iteration.
 */
static void MC_put_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        unsigned long off = (unsigned long) ref & 0x7;
        unsigned long off_plus_1 = off + 1;
        int stride_8 = stride + 8;

        vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

        ref = vis_alignaddr(ref);

        /* Prologue: row 0 aligned (REF_S0) and shifted (REF_S2). */
        vis_ld64(ref[ 0], TMP0);
        vis_fzero(ZERO);

        vis_ld64(ref[ 8], TMP2);

        vis_ld64(constants2[0], CONST_2);

        vis_ld64(constants256_512[0], CONST_256);
        vis_faligndata(TMP0, TMP2, REF_S0);

        if (off != 0x7) {
                vis_alignaddr_g0((void *)off_plus_1);
                vis_faligndata(TMP0, TMP2, REF_S2);
        } else {
                vis_src1(TMP2, REF_S2);
        }

        height >>= 1;
        do {    /* 26 cycles */
                vis_ld64_2(ref, stride, TMP0);
                vis_mul8x16au(REF_S0, CONST_256, TMP8);
                vis_pmerge(ZERO, REF_S2, TMP12);

                vis_alignaddr_g0((void *)off);

                vis_ld64_2(ref, stride_8, TMP2);
                ref += stride;
                vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
                vis_pmerge(ZERO, REF_S2_1, TMP14);

                vis_ld64_2(ref, stride, TMP4);

                vis_ld64_2(ref, stride_8, TMP6);
                ref += stride;
                vis_faligndata(TMP0, TMP2, REF_S4);

                vis_pmerge(ZERO, REF_S4, TMP18);

                vis_pmerge(ZERO, REF_S4_1, TMP20);

                vis_faligndata(TMP4, TMP6, REF_S0);

                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);
                        vis_faligndata(TMP0, TMP2, REF_S6);
                        vis_faligndata(TMP4, TMP6, REF_S2);
                } else {
                        vis_src1(TMP2, REF_S6);
                        vis_src1(TMP6, REF_S2);
                }

                vis_padd16(TMP18, CONST_2, TMP18);
                vis_mul8x16au(REF_S6, CONST_256, TMP22);

                vis_padd16(TMP20, CONST_2, TMP20);
                vis_mul8x16au(REF_S6_1, CONST_256, TMP24);

                vis_mul8x16au(REF_S0, CONST_256, TMP26);
                vis_pmerge(ZERO, REF_S0_1, TMP28);

                vis_mul8x16au(REF_S2, CONST_256, TMP30);
                vis_padd16(TMP18, TMP22, TMP18);

                vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
                vis_padd16(TMP20, TMP24, TMP20);

                vis_padd16(TMP8, TMP18, TMP8);

                vis_padd16(TMP10, TMP20, TMP10);

                vis_padd16(TMP8, TMP12, TMP8);

                vis_padd16(TMP10, TMP14, TMP10);
                vis_pack16(TMP8, DST_0);

                vis_pack16(TMP10, DST_1);
                vis_st64(DST_0, dest[0]);
                dest += stride;
                vis_padd16(TMP18, TMP26, TMP18);

                vis_padd16(TMP20, TMP28, TMP20);

                vis_padd16(TMP18, TMP30, TMP18);

                vis_padd16(TMP20, TMP32, TMP20);
                vis_pack16(TMP18, DST_2);

                vis_pack16(TMP20, DST_3);
                vis_st64(DST_2, dest[0]);
                dest += stride;
        } while (--height);
}
1726
/* Average with horizontal+vertical (xy) half-pel interpolation,
 * 16 pixels wide.
 *
 * Four-neighbour interpolation (dual alignment at off / off+1, with
 * vis_src1 for the off == 7 wrap case) combined with the existing
 * dest bytes (DST_* loads, scaled by CONST_1024).  CONST_6 is the
 * rounding bias; GSR scale factor 4 is used for the final pack16.
 * Heavily hand-scheduled (55-cycle loop comment) with aggressive
 * register reuse — REF_0/REF_2/REF_4/REF_6 are recycled as 16-bit
 * accumulators mid-loop.  Do not reorder statements.
 */
static void MC_avg_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
                              const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        unsigned long off = (unsigned long) ref & 0x7;
        unsigned long off_plus_1 = off + 1;
        int stride_8 = stride + 8;
        int stride_16 = stride + 16;

        vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

        ref = vis_alignaddr(ref);

        /* Prologue: row 0 aligned (REF_S0/REF_S4) and shifted
         * (REF_S2/REF_S6). */
        vis_ld64(ref[ 0], TMP0);
        vis_fzero(ZERO);

        vis_ld64(ref[ 8], TMP2);

        vis_ld64(ref[16], TMP4);

        vis_ld64(constants6[0], CONST_6);
        vis_faligndata(TMP0, TMP2, REF_S0);

        vis_ld64(constants256_1024[0], CONST_256);
        vis_faligndata(TMP2, TMP4, REF_S4);

        if (off != 0x7) {
                vis_alignaddr_g0((void *)off_plus_1);
                vis_faligndata(TMP0, TMP2, REF_S2);
                vis_faligndata(TMP2, TMP4, REF_S6);
        } else {
                vis_src1(TMP2, REF_S2);
                vis_src1(TMP4, REF_S6);
        }

        height >>= 1;
        do {    /* 55 cycles */
                vis_ld64_2(ref, stride, TMP0);
                vis_mul8x16au(REF_S0, CONST_256, TMP12);
                vis_pmerge(ZERO, REF_S0_1, TMP14);

                vis_alignaddr_g0((void *)off);

                vis_ld64_2(ref, stride_8, TMP2);
                vis_mul8x16au(REF_S2, CONST_256, TMP16);
                vis_pmerge(ZERO, REF_S2_1, TMP18);

                vis_ld64_2(ref, stride_16, TMP4);
                ref += stride;
                vis_mul8x16au(REF_S4, CONST_256, TMP20);
                vis_pmerge(ZERO, REF_S4_1, TMP22);

                vis_ld64_2(ref, stride, TMP6);
                vis_mul8x16au(REF_S6, CONST_256, TMP24);
                vis_pmerge(ZERO, REF_S6_1, TMP26);

                vis_ld64_2(ref, stride_8, TMP8);
                vis_faligndata(TMP0, TMP2, REF_0);

                vis_ld64_2(ref, stride_16, TMP10);
                ref += stride;
                vis_faligndata(TMP2, TMP4, REF_4);

                vis_ld64(dest[0], DST_0);
                vis_faligndata(TMP6, TMP8, REF_S0);

                vis_ld64_2(dest, 8, DST_2);
                vis_faligndata(TMP8, TMP10, REF_S4);

                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);
                        vis_faligndata(TMP0, TMP2, REF_2);
                        vis_faligndata(TMP2, TMP4, REF_6);
                        vis_faligndata(TMP6, TMP8, REF_S2);
                        vis_faligndata(TMP8, TMP10, REF_S6);
                } else {
                        vis_src1(TMP2, REF_2);
                        vis_src1(TMP4, REF_6);
                        vis_src1(TMP8, REF_S2);
                        vis_src1(TMP10, REF_S6);
                }

                vis_mul8x16al(DST_0, CONST_1024, TMP30);
                vis_pmerge(ZERO, REF_0, TMP0);

                vis_mul8x16al(DST_1, CONST_1024, TMP32);
                vis_pmerge(ZERO, REF_0_1, TMP2);

                vis_mul8x16au(REF_2, CONST_256, TMP4);
                vis_pmerge(ZERO, REF_2_1, TMP6);

                /* From here on REF_0/REF_2 are reused as scaled dest
                 * terms, no longer as reference pixels. */
                vis_mul8x16al(DST_2, CONST_1024, REF_0);
                vis_padd16(TMP0, CONST_6, TMP0);

                vis_mul8x16al(DST_3, CONST_1024, REF_2);
                vis_padd16(TMP2, CONST_6, TMP2);

                vis_padd16(TMP0, TMP4, TMP0);
                vis_mul8x16au(REF_4, CONST_256, TMP4);

                vis_padd16(TMP2, TMP6, TMP2);
                vis_mul8x16au(REF_4_1, CONST_256, TMP6);

                vis_padd16(TMP12, TMP0, TMP12);
                vis_mul8x16au(REF_6, CONST_256, TMP8);

                vis_padd16(TMP14, TMP2, TMP14);
                vis_mul8x16au(REF_6_1, CONST_256, TMP10);

                vis_padd16(TMP12, TMP16, TMP12);
                vis_mul8x16au(REF_S0, CONST_256, REF_4);

                vis_padd16(TMP14, TMP18, TMP14);
                vis_mul8x16au(REF_S0_1, CONST_256, REF_6);

                vis_padd16(TMP12, TMP30, TMP12);

                vis_padd16(TMP14, TMP32, TMP14);
                vis_pack16(TMP12, DST_0);

                vis_pack16(TMP14, DST_1);
                vis_st64(DST_0, dest[0]);
                vis_padd16(TMP4, CONST_6, TMP4);

                vis_ld64_2(dest, stride, DST_0);
                vis_padd16(TMP6, CONST_6, TMP6);
                vis_mul8x16au(REF_S2, CONST_256, TMP12);

                vis_padd16(TMP4, TMP8, TMP4);
                vis_mul8x16au(REF_S2_1, CONST_256, TMP14);

                vis_padd16(TMP6, TMP10, TMP6);

                vis_padd16(TMP20, TMP4, TMP20);

                vis_padd16(TMP22, TMP6, TMP22);

                vis_padd16(TMP20, TMP24, TMP20);

                vis_padd16(TMP22, TMP26, TMP22);

                vis_padd16(TMP20, REF_0, TMP20);
                vis_mul8x16au(REF_S4, CONST_256, REF_0);

                vis_padd16(TMP22, REF_2, TMP22);
                vis_pack16(TMP20, DST_2);

                vis_pack16(TMP22, DST_3);
                vis_st64_2(DST_2, dest, 8);
                dest += stride;

                vis_ld64_2(dest, 8, DST_2);
                vis_mul8x16al(DST_0, CONST_1024, TMP30);
                vis_pmerge(ZERO, REF_S4_1, REF_2);

                vis_mul8x16al(DST_1, CONST_1024, TMP32);
                vis_padd16(REF_4, TMP0, TMP8);

                vis_mul8x16au(REF_S6, CONST_256, REF_4);
                vis_padd16(REF_6, TMP2, TMP10);

                vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
                vis_padd16(TMP8, TMP12, TMP8);

                vis_padd16(TMP10, TMP14, TMP10);

                vis_padd16(TMP8, TMP30, TMP8);

                vis_padd16(TMP10, TMP32, TMP10);
                vis_pack16(TMP8, DST_0);

                vis_pack16(TMP10, DST_1);
                vis_st64(DST_0, dest[0]);

                vis_padd16(REF_0, TMP4, REF_0);

                vis_mul8x16al(DST_2, CONST_1024, TMP30);
                vis_padd16(REF_2, TMP6, REF_2);

                vis_mul8x16al(DST_3, CONST_1024, TMP32);
                vis_padd16(REF_0, REF_4, REF_0);

                vis_padd16(REF_2, REF_6, REF_2);

                vis_padd16(REF_0, TMP30, REF_0);

                /* stall */

                vis_padd16(REF_2, TMP32, REF_2);
                vis_pack16(REF_0, DST_2);

                vis_pack16(REF_2, DST_3);
                vis_st64_2(DST_2, dest, 8);
                dest += stride;
        } while (--height);
}
1923
/* Average with horizontal+vertical (xy) half-pel interpolation,
 * 8 pixels wide.
 *
 * 8-pixel counterpart of MC_avg_xy_16_vis: four-neighbour
 * interpolation via dual alignment (off / off+1, vis_src1 for the
 * off == 7 wrap case), combined with the existing dest bytes scaled
 * by CONST_1024, CONST_6 rounding bias, GSR scale factor 4.
 * Hand-scheduled (31-cycle loop); two output rows per iteration.
 */
static void MC_avg_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        unsigned long off = (unsigned long) ref & 0x7;
        unsigned long off_plus_1 = off + 1;
        int stride_8 = stride + 8;

        vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

        ref = vis_alignaddr(ref);

        /* Prologue: row 0 aligned (REF_S0) and shifted (REF_S2). */
        vis_ld64(ref[0], TMP0);
        vis_fzero(ZERO);

        vis_ld64_2(ref, 8, TMP2);

        vis_ld64(constants6[0], CONST_6);

        vis_ld64(constants256_1024[0], CONST_256);
        vis_faligndata(TMP0, TMP2, REF_S0);

        if (off != 0x7) {
                vis_alignaddr_g0((void *)off_plus_1);
                vis_faligndata(TMP0, TMP2, REF_S2);
        } else {
                vis_src1(TMP2, REF_S2);
        }

        height >>= 1;
        do {    /* 31 cycles */
                vis_ld64_2(ref, stride, TMP0);
                vis_mul8x16au(REF_S0, CONST_256, TMP8);
                vis_pmerge(ZERO, REF_S0_1, TMP10);

                vis_ld64_2(ref, stride_8, TMP2);
                ref += stride;
                vis_mul8x16au(REF_S2, CONST_256, TMP12);
                vis_pmerge(ZERO, REF_S2_1, TMP14);

                vis_alignaddr_g0((void *)off);

                vis_ld64_2(ref, stride, TMP4);
                vis_faligndata(TMP0, TMP2, REF_S4);

                vis_ld64_2(ref, stride_8, TMP6);
                ref += stride;

                vis_ld64(dest[0], DST_0);
                vis_faligndata(TMP4, TMP6, REF_S0);

                vis_ld64_2(dest, stride, DST_2);

                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);
                        vis_faligndata(TMP0, TMP2, REF_S6);
                        vis_faligndata(TMP4, TMP6, REF_S2);
                } else {
                        vis_src1(TMP2, REF_S6);
                        vis_src1(TMP6, REF_S2);
                }

                vis_mul8x16al(DST_0, CONST_1024, TMP30);
                vis_pmerge(ZERO, REF_S4, TMP22);

                vis_mul8x16al(DST_1, CONST_1024, TMP32);
                vis_pmerge(ZERO, REF_S4_1, TMP24);

                vis_mul8x16au(REF_S6, CONST_256, TMP26);
                vis_pmerge(ZERO, REF_S6_1, TMP28);

                /* REF_S4/REF_S6 are recycled here as widened second-
                 * row terms for the next output row. */
                vis_mul8x16au(REF_S0, CONST_256, REF_S4);
                vis_padd16(TMP22, CONST_6, TMP22);

                vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
                vis_padd16(TMP24, CONST_6, TMP24);

                vis_mul8x16al(DST_2, CONST_1024, REF_0);
                vis_padd16(TMP22, TMP26, TMP22);

                vis_mul8x16al(DST_3, CONST_1024, REF_2);
                vis_padd16(TMP24, TMP28, TMP24);

                vis_mul8x16au(REF_S2, CONST_256, TMP26);
                vis_padd16(TMP8, TMP22, TMP8);

                vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
                vis_padd16(TMP10, TMP24, TMP10);

                vis_padd16(TMP8, TMP12, TMP8);

                vis_padd16(TMP10, TMP14, TMP10);

                vis_padd16(TMP8, TMP30, TMP8);

                vis_padd16(TMP10, TMP32, TMP10);
                vis_pack16(TMP8, DST_0);

                vis_pack16(TMP10, DST_1);
                vis_st64(DST_0, dest[0]);
                dest += stride;

                vis_padd16(REF_S4, TMP22, TMP12);

                vis_padd16(REF_S6, TMP24, TMP14);

                vis_padd16(TMP12, TMP26, TMP12);

                vis_padd16(TMP14, TMP28, TMP14);

                vis_padd16(TMP12, REF_0, TMP12);

                vis_padd16(TMP14, REF_2, TMP14);
                vis_pack16(TMP12, DST_2);

                vis_pack16(TMP14, DST_3);
                vis_st64(DST_2, dest[0]);
                dest += stride;
        } while (--height);
}
2044
/* End of rounding code */

/* Start of no rounding code */
/* The trick used in some of this file is the formula from the MMX
 * motion comp code, which is:
 *
 * (x+y)>>1 == (x&y)+((x^y)>>1)
 *
 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
 * We avoid overflows by masking before we do the shift, and we
 * implement the shift by multiplying by 1/2 using mul8x16.  So in
 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
 * the value 0x80808080 is in f8):
 *
 *      fxor            f0,   f2, f10
 *      fand            f10,  f4, f10
 *      fmul8x16        f8,  f10, f10
 *      fand            f10,  f6, f10
 *      fand            f0,   f2, f12
 *      fpadd16         f12, f10, f10
 */
2067
/* "Put" (plain copy) of a 16-byte-wide block, no-rounding variant.
 * A straight copy involves no averaging, so no-round is identical to
 * the rounding version: per row, load three doublewords spanning the
 * (possibly unaligned) source row, realign with faligndata using the
 * GSR offset set by vis_alignaddr, and store 16 aligned bytes.
 * NOTE(review): dest is presumably 8-byte aligned (vis_st64) and
 * height >= 1 (do/while writes at least one row) — confirm callers. */
static void MC_put_no_round_o_16_vis (uint8_t * dest, const uint8_t * _ref,
                                      const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;

    /* Round ref down to 8-byte alignment; the discarded low bits go
     * into the GSR and steer the faligndata merges below. */
    ref = vis_alignaddr(ref);
    do {    /* 5 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;

        /* Extract the 16 source bytes from the 24 loaded bytes. */
        vis_faligndata(TMP0, TMP2, REF_0);
        vis_st64(REF_0, dest[0]);

        vis_faligndata(TMP2, TMP4, REF_2);
        vis_st64_2(REF_2, dest, 8);
        dest += stride;
    } while (--height);
}
2090
/* "Put" (plain copy) of an 8-byte-wide block, no-rounding variant.
 * Same as the 16-wide copy but one doubleword per row: load 16 bytes
 * around the unaligned source row, merge with faligndata, store 8.
 * NOTE(review): assumes 8-byte-aligned dest and height >= 1 —
 * confirm against callers. */
static void MC_put_no_round_o_8_vis (uint8_t * dest, const uint8_t * _ref,
                                     const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;

    /* Align ref; alignment offset is latched in the GSR. */
    ref = vis_alignaddr(ref);
    do {    /* 4 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);
        ref += stride;

        /* stall */

        vis_faligndata(TMP0, TMP2, REF_0);
        vis_st64(REF_0, dest[0]);
        dest += stride;
    } while (--height);
}
2110
2111
/* Average dest with a 16-byte-wide ref block, truncating (no-round):
 * per byte, dest = (dest + ref) >> 1, computed branch-free as
 * (a&b) + (((a^b) & 0xfe) >> 1).  The >>1 is done by mul8x16 with a
 * vector of 0x80 (CONST_128) followed by masking with 0x7f — see the
 * "Start of no rounding code" comment above.  The body is software-
 * pipelined: two rows per loop iteration, next row's loads overlap
 * the current row's arithmetic, with a fully unrolled 2-row epilogue.
 * NOTE(review): height = (height>>1)-1 plus the epilogue implies
 * height must be even and >= 4 (typically 8/16) — confirm callers. */
static void MC_avg_no_round_o_16_vis (uint8_t * dest, const uint8_t * _ref,
                                      const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    int stride_8 = stride + 8;          /* offset of second dest dword, next row */

    ref = vis_alignaddr(ref);           /* GSR <- low bits of ref */

    /* Prologue: prime REF_0/REF_2 (first source row) and DST_0/DST_2
     * (first dest row), and load the repeating 0xfe/0x7f/0x80 masks. */
    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(dest[0], DST_0);

    vis_ld64(dest[8], DST_2);

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP2, TMP4, REF_2);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 24 cycles */
        /* Row A: average DST_0/2 with REF_0/2 while loading row B. */
        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP6);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP6, MASK_fe, TMP6);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;
        vis_mul8x16(CONST_128, TMP6, TMP6);   /* (a^b)>>1, per byte */
        vis_xor(DST_2, REF_2, TMP8);

        vis_and(TMP8, MASK_fe, TMP8);

        vis_and(DST_0, REF_0, TMP10);         /* a&b term */
        vis_ld64_2(dest, stride, DST_0);      /* prefetch next dest row */
        vis_mul8x16(CONST_128, TMP8, TMP8);

        vis_and(DST_2, REF_2, TMP12);
        vis_ld64_2(dest, stride_8, DST_2);

        vis_ld64(ref[0], TMP14);
        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_padd16(TMP10, TMP6, TMP6);        /* (a&b) + ((a^b)>>1) */
        vis_st64(TMP6, dest[0]);

        vis_padd16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);

        /* Row B: same averaging on the row loaded above. */
        dest += stride;
        vis_ld64_2(ref, 8, TMP16);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, 16, TMP18);
        vis_faligndata(TMP2, TMP4, REF_2);
        ref += stride;

        vis_xor(DST_0, REF_0, TMP20);

        vis_and(TMP20, MASK_fe, TMP20);

        vis_xor(DST_2, REF_2, TMP22);
        vis_mul8x16(CONST_128, TMP20, TMP20);

        vis_and(TMP22, MASK_fe, TMP22);

        vis_and(DST_0, REF_0, TMP24);
        vis_mul8x16(CONST_128, TMP22, TMP22);

        vis_and(DST_2, REF_2, TMP26);

        vis_ld64_2(dest, stride, DST_0);
        vis_faligndata(TMP14, TMP16, REF_0);

        vis_ld64_2(dest, stride_8, DST_2);
        vis_faligndata(TMP16, TMP18, REF_2);

        vis_and(TMP20, MASK_7f, TMP20);

        vis_and(TMP22, MASK_7f, TMP22);

        vis_padd16(TMP24, TMP20, TMP20);
        vis_st64(TMP20, dest[0]);

        vis_padd16(TMP26, TMP22, TMP22);
        vis_st64_2(TMP22, dest, 8);
        dest += stride;
    } while (--height);

    /* Epilogue: the final two rows, unrolled (no further prefetch
     * past the block, so the loads of the row after are dropped). */
    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_ld64_2(ref, 16, TMP4);
    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_xor(DST_2, REF_2, TMP8);

    vis_and(TMP8, MASK_fe, TMP8);

    vis_and(DST_0, REF_0, TMP10);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP8, TMP8);

    vis_and(DST_2, REF_2, TMP12);
    vis_ld64_2(dest, stride_8, DST_2);

    vis_ld64(ref[0], TMP14);
    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_padd16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_padd16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);

    dest += stride;
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_2);

    vis_xor(DST_0, REF_0, TMP20);

    vis_and(TMP20, MASK_fe, TMP20);

    vis_xor(DST_2, REF_2, TMP22);
    vis_mul8x16(CONST_128, TMP20, TMP20);

    vis_and(TMP22, MASK_fe, TMP22);

    vis_and(DST_0, REF_0, TMP24);
    vis_mul8x16(CONST_128, TMP22, TMP22);

    vis_and(DST_2, REF_2, TMP26);

    vis_and(TMP20, MASK_7f, TMP20);

    vis_and(TMP22, MASK_7f, TMP22);

    vis_padd16(TMP24, TMP20, TMP20);
    vis_st64(TMP20, dest[0]);

    vis_padd16(TMP26, TMP22, TMP22);
    vis_st64_2(TMP22, dest, 8);
}
2272
/* Average dest with an 8-byte-wide ref block, truncating (no-round):
 * per byte, dest = (dest + ref) >> 1 via (a&b) + (((a^b)&0xfe)>>1);
 * the shift is mul8x16 by 0x80 then mask with 0x7f.  Two rows are
 * processed per loop iteration (software-pipelined), plus a fully
 * unrolled 2-row epilogue.
 * NOTE(review): height = (height>>1)-1 plus the epilogue implies
 * height even and >= 4 — confirm callers. */
static void MC_avg_no_round_o_8_vis (uint8_t * dest, const uint8_t * _ref,
                                     const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;

    ref = vis_alignaddr(ref);           /* GSR <- low bits of ref */

    /* Prologue: first source row into REF_0, first dest row into
     * DST_0, and the 0xfe/0x7f/0x80 constant vectors. */
    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(dest[0], DST_0);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 12 cycles */
        /* Row A average, overlapped with row B loads. */
        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP4);

        vis_ld64(ref[8], TMP2);
        vis_and(TMP4, MASK_fe, TMP4);

        vis_and(DST_0, REF_0, TMP6);          /* a&b term */
        vis_ld64_2(dest, stride, DST_0);      /* prefetch next dest row */
        ref += stride;
        vis_mul8x16(CONST_128, TMP4, TMP4);   /* (a^b)>>1 */

        vis_ld64(ref[0], TMP12);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(ref[8], TMP2);
        vis_xor(DST_0, REF_0, TMP0);
        ref += stride;

        vis_and(TMP0, MASK_fe, TMP0);

        vis_and(TMP4, MASK_7f, TMP4);

        vis_padd16(TMP6, TMP4, TMP4);         /* (a&b) + ((a^b)>>1) */
        vis_st64(TMP4, dest[0]);
        dest += stride;
        vis_mul8x16(CONST_128, TMP0, TMP0);

        /* Row B average. */
        vis_and(DST_0, REF_0, TMP6);
        vis_ld64_2(dest, stride, DST_0);

        vis_faligndata(TMP12, TMP2, REF_0);

        vis_and(TMP0, MASK_7f, TMP0);

        vis_padd16(TMP6, TMP0, TMP4);
        vis_st64(TMP4, dest[0]);
        dest += stride;
    } while (--height);

    /* Epilogue: last two rows, unrolled without prefetching past the
     * end of the block. */
    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP4);

    vis_ld64(ref[8], TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_and(DST_0, REF_0, TMP6);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_xor(DST_0, REF_0, TMP0);

    vis_and(TMP0, MASK_fe, TMP0);

    vis_and(TMP4, MASK_7f, TMP4);

    vis_padd16(TMP6, TMP4, TMP4);
    vis_st64(TMP4, dest[0]);
    dest += stride;
    vis_mul8x16(CONST_128, TMP0, TMP0);

    vis_and(DST_0, REF_0, TMP6);

    vis_and(TMP0, MASK_7f, TMP0);

    vis_padd16(TMP6, TMP0, TMP4);
    vis_st64(TMP4, dest[0]);
}
2366
/* Half-pel-in-x "put" of a 16-byte-wide block, truncating (no-round):
 * each output byte is (ref[x] + ref[x+1]) >> 1, computed branch-free
 * as (a&b) + (((a^b)&0xfe)>>1) with the shift done by mul8x16/0x80
 * and a 0x7f mask.  REF_0/REF_4 hold the row aligned at offset `off`;
 * REF_2/REF_6 hold the same row aligned at off+1, i.e. the x+1
 * pixels.  The GSR align offset must be re-set (vis_alignaddr_g0)
 * before each group of faligndata ops because the two alignments
 * alternate.  When off == 7, off+1 == 8 means the shifted row is a
 * whole doubleword over; NOTE(review): vis_src1 appears to select
 * that second doubleword directly — confirm against vis.h.
 * Two rows per pipelined iteration plus an unrolled 2-row epilogue;
 * NOTE(review): assumes height even and >= 4 — confirm callers. */
static void MC_put_no_round_x_16_vis (uint8_t * dest, const uint8_t * _ref,
                                      const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;   /* sub-dword misalignment */
    unsigned long off_plus_1 = off + 1;              /* alignment of the x+1 pixels */

    ref = vis_alignaddr(ref);

    /* Prologue: build both alignments of the first row and load the
     * constant mask vectors. */
    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, 8, TMP2);

    vis_ld64_2(ref, 16, TMP4);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants128[0], CONST_128);
    vis_faligndata(TMP2, TMP4, REF_4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
        vis_faligndata(TMP2, TMP4, REF_6);
    } else {
        vis_src1(TMP2, REF_2);
        vis_src1(TMP4, REF_6);
    }

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 34 cycles */
        /* Row A: average REF_0/REF_2 and REF_4/REF_6 while loading
         * the next two source rows. */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP6);

        vis_ld64_2(ref, 8, TMP2);
        vis_xor(REF_4, REF_6, TMP8);

        vis_ld64_2(ref, 16, TMP4);
        vis_and(TMP6, MASK_fe, TMP6);
        ref += stride;

        vis_ld64(ref[0], TMP14);
        vis_mul8x16(CONST_128, TMP6, TMP6);   /* (a^b)>>1 */
        vis_and(TMP8, MASK_fe, TMP8);

        vis_ld64_2(ref, 8, TMP16);
        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_and(REF_0, REF_2, TMP10);         /* a&b term */

        vis_ld64_2(ref, 16, TMP18);
        ref += stride;
        vis_and(REF_4, REF_6, TMP12);

        /* Re-align row B at off, then at off+1. */
        vis_alignaddr_g0((void *)off);

        vis_faligndata(TMP0, TMP2, REF_0);

        vis_faligndata(TMP2, TMP4, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
        }

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_padd16(TMP10, TMP6, TMP6);        /* (a&b) + ((a^b)>>1) */
        vis_st64(TMP6, dest[0]);

        vis_padd16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
        dest += stride;

        /* Row B: same averaging, then align row C from TMP14..TMP18. */
        vis_xor(REF_0, REF_2, TMP6);

        vis_xor(REF_4, REF_6, TMP8);

        vis_and(TMP6, MASK_fe, TMP6);

        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_and(TMP8, MASK_fe, TMP8);

        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_and(REF_0, REF_2, TMP10);

        vis_and(REF_4, REF_6, TMP12);

        vis_alignaddr_g0((void *)off);

        vis_faligndata(TMP14, TMP16, REF_0);

        vis_faligndata(TMP16, TMP18, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP14, TMP16, REF_2);
            vis_faligndata(TMP16, TMP18, REF_6);
        } else {
            vis_src1(TMP16, REF_2);
            vis_src1(TMP18, REF_6);
        }

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_padd16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_padd16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
        dest += stride;
    } while (--height);

    /* Epilogue: last two rows, unrolled; the final row needs no
     * further source loads or re-alignment. */
    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_xor(REF_4, REF_6, TMP8);

    vis_ld64_2(ref, 16, TMP4);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_and(TMP8, MASK_fe, TMP8);

    vis_mul8x16(CONST_128, TMP8, TMP8);
    vis_and(REF_0, REF_2, TMP10);

    vis_and(REF_4, REF_6, TMP12);

    vis_alignaddr_g0((void *)off);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
        vis_faligndata(TMP2, TMP4, REF_6);
    } else {
        vis_src1(TMP2, REF_2);
        vis_src1(TMP4, REF_6);
    }

    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_padd16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_padd16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);
    dest += stride;

    vis_xor(REF_0, REF_2, TMP6);

    vis_xor(REF_4, REF_6, TMP8);

    vis_and(TMP6, MASK_fe, TMP6);

    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_and(TMP8, MASK_fe, TMP8);

    vis_mul8x16(CONST_128, TMP8, TMP8);
    vis_and(REF_0, REF_2, TMP10);

    vis_and(REF_4, REF_6, TMP12);

    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_padd16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_padd16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);
}
2559
/* Half-pel-in-x "put" of an 8-byte-wide block, truncating (no-round):
 * output byte = (ref[x] + ref[x+1]) >> 1 via (a&b) + (((a^b)&0xfe)>>1).
 * REF_0 is the row aligned at `off`, REF_2 the same row aligned at
 * off+1 (the x+1 pixels); the GSR offset is switched with
 * vis_alignaddr_g0 between the two alignments.  When off == 7 the
 * x+1 alignment is exactly one doubleword over; NOTE(review):
 * vis_src1 appears to select that second doubleword directly —
 * confirm against vis.h.  Two rows per pipelined iteration, plus an
 * unrolled 2-row epilogue; NOTE(review): assumes height even and
 * >= 4 — confirm callers. */
static void MC_put_no_round_x_8_vis (uint8_t * dest, const uint8_t * _ref,
                                     const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;   /* sub-dword misalignment */
    unsigned long off_plus_1 = off + 1;              /* alignment of the x+1 pixels */

    ref = vis_alignaddr(ref);

    /* Prologue: both alignments of the first row, plus constants. */
    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);

    vis_ld64(constants128[0], CONST_128);
    vis_faligndata(TMP0, TMP2, REF_0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
    } else {
        vis_src1(TMP2, REF_2);
    }

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 20 cycles */
        /* Row A: average REF_0/REF_2 while loading rows B and C. */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP4);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP4, MASK_fe, TMP4);
        ref += stride;

        vis_ld64(ref[0], TMP8);
        vis_and(REF_0, REF_2, TMP6);          /* a&b term */
        vis_mul8x16(CONST_128, TMP4, TMP4);   /* (a^b)>>1 */

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, 8, TMP10);
        ref += stride;
        vis_faligndata(TMP0, TMP2, REF_0);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
        } else {
            vis_src1(TMP2, REF_2);
        }

        vis_and(TMP4, MASK_7f, TMP4);

        vis_padd16(TMP6, TMP4, DST_0);        /* (a&b) + ((a^b)>>1) */
        vis_st64(DST_0, dest[0]);
        dest += stride;

        /* Row B: same averaging, then align row C from TMP8/TMP10. */
        vis_xor(REF_0, REF_2, TMP12);

        vis_and(TMP12, MASK_fe, TMP12);

        vis_and(REF_0, REF_2, TMP14);
        vis_mul8x16(CONST_128, TMP12, TMP12);

        vis_alignaddr_g0((void *)off);
        vis_faligndata(TMP8, TMP10, REF_0);
        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP8, TMP10, REF_2);
        } else {
            vis_src1(TMP10, REF_2);
        }

        vis_and(TMP12, MASK_7f, TMP12);

        vis_padd16(TMP14, TMP12, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;
    } while (--height);

    /* Epilogue: final two rows, unrolled; the very last row needs no
     * further source loads. */
    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP4);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_and(REF_0, REF_2, TMP6);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_alignaddr_g0((void *)off);

    vis_faligndata(TMP0, TMP2, REF_0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
    } else {
        vis_src1(TMP2, REF_2);
    }

    vis_and(TMP4, MASK_7f, TMP4);

    vis_padd16(TMP6, TMP4, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;

    vis_xor(REF_0, REF_2, TMP12);

    vis_and(TMP12, MASK_fe, TMP12);

    vis_and(REF_0, REF_2, TMP14);
    vis_mul8x16(CONST_128, TMP12, TMP12);

    vis_and(TMP12, MASK_7f, TMP12);

    vis_padd16(TMP14, TMP12, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;
}
2683
2684static void MC_avg_no_round_x_16_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 2685 const int stride, int height)
44f54ceb 2686{
bb270c08
DB
2687 uint8_t *ref = (uint8_t *) _ref;
2688 unsigned long off = (unsigned long) ref & 0x7;
2689 unsigned long off_plus_1 = off + 1;
44f54ceb 2690
bb270c08 2691 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 2692
bb270c08
DB
2693 vis_ld64(constants3[0], CONST_3);
2694 vis_fzero(ZERO);
2695 vis_ld64(constants256_512[0], CONST_256);
44f54ceb 2696
bb270c08
DB
2697 ref = vis_alignaddr(ref);
2698 do { /* 26 cycles */
2699 vis_ld64(ref[0], TMP0);
44f54ceb 2700
bb270c08 2701 vis_ld64(ref[8], TMP2);
44f54ceb 2702
bb270c08 2703 vis_alignaddr_g0((void *)off);
44f54ceb 2704
bb270c08 2705 vis_ld64(ref[16], TMP4);
44f54ceb 2706
bb270c08
DB
2707 vis_ld64(dest[0], DST_0);
2708 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2709
bb270c08
DB
2710 vis_ld64(dest[8], DST_2);
2711 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 2712
bb270c08
DB
2713 if (off != 0x7) {
2714 vis_alignaddr_g0((void *)off_plus_1);
2715 vis_faligndata(TMP0, TMP2, REF_2);
2716 vis_faligndata(TMP2, TMP4, REF_6);
2717 } else {
2718 vis_src1(TMP2, REF_2);
2719 vis_src1(TMP4, REF_6);
2720 }
44f54ceb 2721
bb270c08 2722 vis_mul8x16au(REF_0, CONST_256, TMP0);
44f54ceb 2723
bb270c08
DB
2724 vis_pmerge(ZERO, REF_2, TMP4);
2725 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
44f54ceb 2726
bb270c08 2727 vis_pmerge(ZERO, REF_2_1, TMP6);
44f54ceb 2728
bb270c08 2729 vis_padd16(TMP0, TMP4, TMP0);
44f54ceb 2730
bb270c08
DB
2731 vis_mul8x16al(DST_0, CONST_512, TMP4);
2732 vis_padd16(TMP2, TMP6, TMP2);
44f54ceb 2733
bb270c08 2734 vis_mul8x16al(DST_1, CONST_512, TMP6);
44f54ceb 2735
bb270c08 2736 vis_mul8x16au(REF_6, CONST_256, TMP12);
44f54ceb 2737
bb270c08
DB
2738 vis_padd16(TMP0, TMP4, TMP0);
2739 vis_mul8x16au(REF_6_1, CONST_256, TMP14);
44f54ceb 2740
bb270c08
DB
2741 vis_padd16(TMP2, TMP6, TMP2);
2742 vis_mul8x16au(REF_4, CONST_256, TMP16);
44f54ceb 2743
bb270c08
DB
2744 vis_padd16(TMP0, CONST_3, TMP8);
2745 vis_mul8x16au(REF_4_1, CONST_256, TMP18);
44f54ceb 2746
bb270c08
DB
2747 vis_padd16(TMP2, CONST_3, TMP10);
2748 vis_pack16(TMP8, DST_0);
44f54ceb 2749
bb270c08
DB
2750 vis_pack16(TMP10, DST_1);
2751 vis_padd16(TMP16, TMP12, TMP0);
44f54ceb 2752
bb270c08
DB
2753 vis_st64(DST_0, dest[0]);
2754 vis_mul8x16al(DST_2, CONST_512, TMP4);
2755 vis_padd16(TMP18, TMP14, TMP2);
44f54ceb 2756
bb270c08
DB
2757 vis_mul8x16al(DST_3, CONST_512, TMP6);
2758 vis_padd16(TMP0, CONST_3, TMP0);
44f54ceb 2759
bb270c08 2760 vis_padd16(TMP2, CONST_3, TMP2);
44f54ceb 2761
bb270c08 2762 vis_padd16(TMP0, TMP4, TMP0);
44f54ceb 2763
bb270c08
DB
2764 vis_padd16(TMP2, TMP6, TMP2);
2765 vis_pack16(TMP0, DST_2);