Add more missing includes after removing the implicit common.h
[libav.git] / libavcodec / sparc / dsputil_vis.c
CommitLineData
44f54ceb 1/*
44f54ceb
MN
2 * Copyright (C) 2003 David S. Miller <davem@redhat.com>
3 *
2912e87a 4 * This file is part of Libav.
44f54ceb 5 *
2912e87a 6 * Libav is free software; you can redistribute it and/or
a33fe572
DB
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
44f54ceb 10 *
2912e87a 11 * Libav is distributed in the hope that it will be useful,
44f54ceb 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
a33fe572
DB
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
44f54ceb 15 *
a33fe572 16 * You should have received a copy of the GNU Lesser General Public
2912e87a 17 * License along with Libav; if not, write to the Free Software
5509bffa 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
44f54ceb
MN
19 */
20
0f12310f 21/* The *no_round* functions have been added by James A. Morrison, 2003,2004.
2f5df0b1 22 The vis code from libmpeg2 was adapted for libavcodec by James A. Morrison.
44f54ceb
MN
23 */
24
25#include "config.h"
26
44f54ceb
MN
27#include <inttypes.h>
28
245976da 29#include "libavcodec/dsputil.h"
33e11284 30#include "libavutil/mem.h"
ad403802 31#include "dsputil_vis.h"
44f54ceb
MN
32
33#include "vis.h"
34
35/* The trick used in some of this file is the formula from the MMX
36 * motion comp code, which is:
37 *
38 * (x+y+1)>>1 == (x|y)-((x^y)>>1)
39 *
40 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
41 * We avoid overflows by masking before we do the shift, and we
42 * implement the shift by multiplying by 1/2 using mul8x16. So in
43 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
44 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
45 * the value 0x80808080 is in f8):
46 *
bb270c08
DB
47 * fxor f0, f2, f10
48 * fand f10, f4, f10
49 * fmul8x16 f8, f10, f10
50 * fand f10, f6, f10
51 * for f0, f2, f12
52 * fpsub16 f12, f10, f10
44f54ceb
MN
53 */
54
44f54ceb
MN
55#define DUP4(x) {x, x, x, x}
56#define DUP8(x) {x, x, x, x, x, x, x, x}
d343d598
MR
57DECLARE_ALIGNED(8, static const int16_t, constants1)[] = DUP4 (1);
58DECLARE_ALIGNED(8, static const int16_t, constants2)[] = DUP4 (2);
59DECLARE_ALIGNED(8, static const int16_t, constants3)[] = DUP4 (3);
60DECLARE_ALIGNED(8, static const int16_t, constants6)[] = DUP4 (6);
61DECLARE_ALIGNED(8, static const int8_t, constants_fe)[] = DUP8 (0xfe);
62DECLARE_ALIGNED(8, static const int8_t, constants_7f)[] = DUP8 (0x7f);
63DECLARE_ALIGNED(8, static const int8_t, constants128)[] = DUP8 (128);
64DECLARE_ALIGNED(8, static const int16_t, constants256_512)[] =
bb270c08 65 {256, 512, 256, 512};
d343d598 66DECLARE_ALIGNED(8, static const int16_t, constants256_1024)[] =
bb270c08
DB
67 {256, 1024, 256, 1024};
68
69#define REF_0 0
70#define REF_0_1 1
71#define REF_2 2
72#define REF_2_1 3
73#define REF_4 4
74#define REF_4_1 5
75#define REF_6 6
76#define REF_6_1 7
77#define REF_S0 8
78#define REF_S0_1 9
79#define REF_S2 10
80#define REF_S2_1 11
81#define REF_S4 12
82#define REF_S4_1 13
83#define REF_S6 14
84#define REF_S6_1 15
85#define DST_0 16
86#define DST_1 17
87#define DST_2 18
88#define DST_3 19
89#define CONST_1 20
90#define CONST_2 20
91#define CONST_3 20
92#define CONST_6 20
93#define MASK_fe 20
94#define CONST_128 22
95#define CONST_256 22
96#define CONST_512 22
97#define CONST_1024 22
98#define TMP0 24
99#define TMP1 25
100#define TMP2 26
101#define TMP3 27
102#define TMP4 28
103#define TMP5 29
104#define ZERO 30
105#define MASK_7f 30
106
107#define TMP6 32
108#define TMP8 34
109#define TMP10 36
110#define TMP12 38
111#define TMP14 40
112#define TMP16 42
113#define TMP18 44
114#define TMP20 46
115#define TMP22 48
116#define TMP24 50
117#define TMP26 52
118#define TMP28 54
119#define TMP30 56
120#define TMP32 58
44f54ceb 121
86decad6 122static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 123 const int stride, int height)
44f54ceb 124{
bb270c08
DB
125 ref = vis_alignaddr(ref);
126 do { /* 5 cycles */
127 vis_ld64(ref[0], TMP0);
44f54ceb 128
bb270c08 129 vis_ld64_2(ref, 8, TMP2);
44f54ceb 130
bb270c08
DB
131 vis_ld64_2(ref, 16, TMP4);
132 ref += stride;
44f54ceb 133
bb270c08
DB
134 vis_faligndata(TMP0, TMP2, REF_0);
135 vis_st64(REF_0, dest[0]);
44f54ceb 136
bb270c08
DB
137 vis_faligndata(TMP2, TMP4, REF_2);
138 vis_st64_2(REF_2, dest, 8);
139 dest += stride;
140 } while (--height);
44f54ceb
MN
141}
142
86decad6 143static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 144 const int stride, int height)
44f54ceb 145{
bb270c08
DB
146 ref = vis_alignaddr(ref);
147 do { /* 4 cycles */
148 vis_ld64(ref[0], TMP0);
44f54ceb 149
bb270c08
DB
150 vis_ld64(ref[8], TMP2);
151 ref += stride;
44f54ceb 152
bb270c08 153 /* stall */
44f54ceb 154
bb270c08
DB
155 vis_faligndata(TMP0, TMP2, REF_0);
156 vis_st64(REF_0, dest[0]);
157 dest += stride;
158 } while (--height);
44f54ceb
MN
159}
160
161
86decad6 162static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 163 const int stride, int height)
44f54ceb 164{
bb270c08 165 int stride_8 = stride + 8;
44f54ceb 166
bb270c08 167 ref = vis_alignaddr(ref);
44f54ceb 168
bb270c08 169 vis_ld64(ref[0], TMP0);
44f54ceb 170
bb270c08 171 vis_ld64(ref[8], TMP2);
44f54ceb 172
bb270c08 173 vis_ld64(ref[16], TMP4);
44f54ceb 174
bb270c08 175 vis_ld64(dest[0], DST_0);
44f54ceb 176
bb270c08 177 vis_ld64(dest[8], DST_2);
44f54ceb 178
bb270c08
DB
179 vis_ld64(constants_fe[0], MASK_fe);
180 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 181
bb270c08
DB
182 vis_ld64(constants_7f[0], MASK_7f);
183 vis_faligndata(TMP2, TMP4, REF_2);
44f54ceb 184
bb270c08 185 vis_ld64(constants128[0], CONST_128);
44f54ceb 186
bb270c08
DB
187 ref += stride;
188 height = (height >> 1) - 1;
44f54ceb 189
bb270c08
DB
190 do { /* 24 cycles */
191 vis_ld64(ref[0], TMP0);
192 vis_xor(DST_0, REF_0, TMP6);
44f54ceb 193
bb270c08
DB
194 vis_ld64_2(ref, 8, TMP2);
195 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 196
bb270c08
DB
197 vis_ld64_2(ref, 16, TMP4);
198 ref += stride;
199 vis_mul8x16(CONST_128, TMP6, TMP6);
200 vis_xor(DST_2, REF_2, TMP8);
44f54ceb 201
bb270c08 202 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 203
bb270c08
DB
204 vis_or(DST_0, REF_0, TMP10);
205 vis_ld64_2(dest, stride, DST_0);
206 vis_mul8x16(CONST_128, TMP8, TMP8);
44f54ceb 207
bb270c08
DB
208 vis_or(DST_2, REF_2, TMP12);
209 vis_ld64_2(dest, stride_8, DST_2);
44f54ceb 210
bb270c08
DB
211 vis_ld64(ref[0], TMP14);
212 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 213
bb270c08 214 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 215
bb270c08
DB
216 vis_psub16(TMP10, TMP6, TMP6);
217 vis_st64(TMP6, dest[0]);
44f54ceb 218
bb270c08
DB
219 vis_psub16(TMP12, TMP8, TMP8);
220 vis_st64_2(TMP8, dest, 8);
44f54ceb 221
bb270c08
DB
222 dest += stride;
223 vis_ld64_2(ref, 8, TMP16);
224 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 225
bb270c08
DB
226 vis_ld64_2(ref, 16, TMP18);
227 vis_faligndata(TMP2, TMP4, REF_2);
228 ref += stride;
44f54ceb 229
bb270c08 230 vis_xor(DST_0, REF_0, TMP20);
44f54ceb 231
bb270c08 232 vis_and(TMP20, MASK_fe, TMP20);
44f54ceb 233
bb270c08
DB
234 vis_xor(DST_2, REF_2, TMP22);
235 vis_mul8x16(CONST_128, TMP20, TMP20);
44f54ceb 236
bb270c08 237 vis_and(TMP22, MASK_fe, TMP22);
44f54ceb 238
bb270c08
DB
239 vis_or(DST_0, REF_0, TMP24);
240 vis_mul8x16(CONST_128, TMP22, TMP22);
44f54ceb 241
bb270c08 242 vis_or(DST_2, REF_2, TMP26);
44f54ceb 243
bb270c08
DB
244 vis_ld64_2(dest, stride, DST_0);
245 vis_faligndata(TMP14, TMP16, REF_0);
44f54ceb 246
bb270c08
DB
247 vis_ld64_2(dest, stride_8, DST_2);
248 vis_faligndata(TMP16, TMP18, REF_2);
44f54ceb 249
bb270c08 250 vis_and(TMP20, MASK_7f, TMP20);
44f54ceb 251
bb270c08 252 vis_and(TMP22, MASK_7f, TMP22);
44f54ceb 253
bb270c08
DB
254 vis_psub16(TMP24, TMP20, TMP20);
255 vis_st64(TMP20, dest[0]);
44f54ceb 256
bb270c08
DB
257 vis_psub16(TMP26, TMP22, TMP22);
258 vis_st64_2(TMP22, dest, 8);
259 dest += stride;
260 } while (--height);
44f54ceb 261
bb270c08
DB
262 vis_ld64(ref[0], TMP0);
263 vis_xor(DST_0, REF_0, TMP6);
44f54ceb 264
bb270c08
DB
265 vis_ld64_2(ref, 8, TMP2);
266 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 267
bb270c08
DB
268 vis_ld64_2(ref, 16, TMP4);
269 vis_mul8x16(CONST_128, TMP6, TMP6);
270 vis_xor(DST_2, REF_2, TMP8);
44f54ceb 271
bb270c08 272 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 273
bb270c08
DB
274 vis_or(DST_0, REF_0, TMP10);
275 vis_ld64_2(dest, stride, DST_0);
276 vis_mul8x16(CONST_128, TMP8, TMP8);
44f54ceb 277
bb270c08
DB
278 vis_or(DST_2, REF_2, TMP12);
279 vis_ld64_2(dest, stride_8, DST_2);
44f54ceb 280
bb270c08
DB
281 vis_ld64(ref[0], TMP14);
282 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 283
bb270c08 284 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 285
bb270c08
DB
286 vis_psub16(TMP10, TMP6, TMP6);
287 vis_st64(TMP6, dest[0]);
44f54ceb 288
bb270c08
DB
289 vis_psub16(TMP12, TMP8, TMP8);
290 vis_st64_2(TMP8, dest, 8);
44f54ceb 291
bb270c08
DB
292 dest += stride;
293 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 294
bb270c08 295 vis_faligndata(TMP2, TMP4, REF_2);
44f54ceb 296
bb270c08 297 vis_xor(DST_0, REF_0, TMP20);
44f54ceb 298
bb270c08 299 vis_and(TMP20, MASK_fe, TMP20);
44f54ceb 300
bb270c08
DB
301 vis_xor(DST_2, REF_2, TMP22);
302 vis_mul8x16(CONST_128, TMP20, TMP20);
44f54ceb 303
bb270c08 304 vis_and(TMP22, MASK_fe, TMP22);
44f54ceb 305
bb270c08
DB
306 vis_or(DST_0, REF_0, TMP24);
307 vis_mul8x16(CONST_128, TMP22, TMP22);
44f54ceb 308
bb270c08 309 vis_or(DST_2, REF_2, TMP26);
44f54ceb 310
bb270c08 311 vis_and(TMP20, MASK_7f, TMP20);
44f54ceb 312
bb270c08 313 vis_and(TMP22, MASK_7f, TMP22);
44f54ceb 314
bb270c08
DB
315 vis_psub16(TMP24, TMP20, TMP20);
316 vis_st64(TMP20, dest[0]);
44f54ceb 317
bb270c08
DB
318 vis_psub16(TMP26, TMP22, TMP22);
319 vis_st64_2(TMP22, dest, 8);
44f54ceb
MN
320}
321
86decad6 322static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 323 const int stride, int height)
44f54ceb 324{
bb270c08 325 ref = vis_alignaddr(ref);
44f54ceb 326
bb270c08 327 vis_ld64(ref[0], TMP0);
44f54ceb 328
bb270c08 329 vis_ld64(ref[8], TMP2);
44f54ceb 330
bb270c08 331 vis_ld64(dest[0], DST_0);
44f54ceb 332
bb270c08 333 vis_ld64(constants_fe[0], MASK_fe);
44f54ceb 334
bb270c08
DB
335 vis_ld64(constants_7f[0], MASK_7f);
336 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 337
bb270c08 338 vis_ld64(constants128[0], CONST_128);
44f54ceb 339
bb270c08
DB
340 ref += stride;
341 height = (height >> 1) - 1;
44f54ceb 342
bb270c08
DB
343 do { /* 12 cycles */
344 vis_ld64(ref[0], TMP0);
345 vis_xor(DST_0, REF_0, TMP4);
44f54ceb 346
bb270c08
DB
347 vis_ld64(ref[8], TMP2);
348 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 349
bb270c08
DB
350 vis_or(DST_0, REF_0, TMP6);
351 vis_ld64_2(dest, stride, DST_0);
352 ref += stride;
353 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 354
bb270c08
DB
355 vis_ld64(ref[0], TMP12);
356 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 357
bb270c08
DB
358 vis_ld64(ref[8], TMP2);
359 vis_xor(DST_0, REF_0, TMP0);
360 ref += stride;
44f54ceb 361
bb270c08 362 vis_and(TMP0, MASK_fe, TMP0);
44f54ceb 363
bb270c08 364 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 365
bb270c08
DB
366 vis_psub16(TMP6, TMP4, TMP4);
367 vis_st64(TMP4, dest[0]);
368 dest += stride;
369 vis_mul8x16(CONST_128, TMP0, TMP0);
44f54ceb 370
bb270c08
DB
371 vis_or(DST_0, REF_0, TMP6);
372 vis_ld64_2(dest, stride, DST_0);
44f54ceb 373
bb270c08 374 vis_faligndata(TMP12, TMP2, REF_0);
44f54ceb 375
bb270c08 376 vis_and(TMP0, MASK_7f, TMP0);
44f54ceb 377
bb270c08
DB
378 vis_psub16(TMP6, TMP0, TMP4);
379 vis_st64(TMP4, dest[0]);
380 dest += stride;
381 } while (--height);
44f54ceb 382
bb270c08
DB
383 vis_ld64(ref[0], TMP0);
384 vis_xor(DST_0, REF_0, TMP4);
44f54ceb 385
bb270c08
DB
386 vis_ld64(ref[8], TMP2);
387 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 388
bb270c08
DB
389 vis_or(DST_0, REF_0, TMP6);
390 vis_ld64_2(dest, stride, DST_0);
391 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 392
bb270c08 393 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 394
bb270c08 395 vis_xor(DST_0, REF_0, TMP0);
44f54ceb 396
bb270c08 397 vis_and(TMP0, MASK_fe, TMP0);
44f54ceb 398
bb270c08 399 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 400
bb270c08
DB
401 vis_psub16(TMP6, TMP4, TMP4);
402 vis_st64(TMP4, dest[0]);
403 dest += stride;
404 vis_mul8x16(CONST_128, TMP0, TMP0);
44f54ceb 405
bb270c08 406 vis_or(DST_0, REF_0, TMP6);
44f54ceb 407
bb270c08 408 vis_and(TMP0, MASK_7f, TMP0);
44f54ceb 409
bb270c08
DB
410 vis_psub16(TMP6, TMP0, TMP4);
411 vis_st64(TMP4, dest[0]);
44f54ceb
MN
412}
413
86decad6 414static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 415 const int stride, int height)
44f54ceb 416{
bb270c08
DB
417 unsigned long off = (unsigned long) ref & 0x7;
418 unsigned long off_plus_1 = off + 1;
44f54ceb 419
bb270c08 420 ref = vis_alignaddr(ref);
44f54ceb 421
bb270c08 422 vis_ld64(ref[0], TMP0);
44f54ceb 423
bb270c08 424 vis_ld64_2(ref, 8, TMP2);
44f54ceb 425
bb270c08 426 vis_ld64_2(ref, 16, TMP4);
44f54ceb 427
bb270c08 428 vis_ld64(constants_fe[0], MASK_fe);
44f54ceb 429
bb270c08
DB
430 vis_ld64(constants_7f[0], MASK_7f);
431 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 432
bb270c08
DB
433 vis_ld64(constants128[0], CONST_128);
434 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 435
bb270c08
DB
436 if (off != 0x7) {
437 vis_alignaddr_g0((void *)off_plus_1);
438 vis_faligndata(TMP0, TMP2, REF_2);
439 vis_faligndata(TMP2, TMP4, REF_6);
440 } else {
441 vis_src1(TMP2, REF_2);
442 vis_src1(TMP4, REF_6);
443 }
44f54ceb 444
bb270c08
DB
445 ref += stride;
446 height = (height >> 1) - 1;
44f54ceb 447
bb270c08
DB
448 do { /* 34 cycles */
449 vis_ld64(ref[0], TMP0);
450 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 451
bb270c08
DB
452 vis_ld64_2(ref, 8, TMP2);
453 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 454
bb270c08
DB
455 vis_ld64_2(ref, 16, TMP4);
456 vis_and(TMP6, MASK_fe, TMP6);
457 ref += stride;
44f54ceb 458
bb270c08
DB
459 vis_ld64(ref[0], TMP14);
460 vis_mul8x16(CONST_128, TMP6, TMP6);
461 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 462
bb270c08
DB
463 vis_ld64_2(ref, 8, TMP16);
464 vis_mul8x16(CONST_128, TMP8, TMP8);
465 vis_or(REF_0, REF_2, TMP10);
44f54ceb 466
bb270c08
DB
467 vis_ld64_2(ref, 16, TMP18);
468 ref += stride;
469 vis_or(REF_4, REF_6, TMP12);
44f54ceb 470
bb270c08 471 vis_alignaddr_g0((void *)off);
44f54ceb 472
bb270c08 473 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 474
bb270c08 475 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 476
bb270c08
DB
477 if (off != 0x7) {
478 vis_alignaddr_g0((void *)off_plus_1);
479 vis_faligndata(TMP0, TMP2, REF_2);
480 vis_faligndata(TMP2, TMP4, REF_6);
481 } else {
482 vis_src1(TMP2, REF_2);
483 vis_src1(TMP4, REF_6);
484 }
44f54ceb 485
bb270c08 486 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 487
bb270c08 488 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 489
bb270c08
DB
490 vis_psub16(TMP10, TMP6, TMP6);
491 vis_st64(TMP6, dest[0]);
44f54ceb 492
bb270c08
DB
493 vis_psub16(TMP12, TMP8, TMP8);
494 vis_st64_2(TMP8, dest, 8);
495 dest += stride;
44f54ceb 496
bb270c08 497 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 498
bb270c08 499 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 500
bb270c08 501 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 502
bb270c08
DB
503 vis_mul8x16(CONST_128, TMP6, TMP6);
504 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 505
bb270c08
DB
506 vis_mul8x16(CONST_128, TMP8, TMP8);
507 vis_or(REF_0, REF_2, TMP10);
44f54ceb 508
bb270c08 509 vis_or(REF_4, REF_6, TMP12);
44f54ceb 510
bb270c08 511 vis_alignaddr_g0((void *)off);
44f54ceb 512
bb270c08 513 vis_faligndata(TMP14, TMP16, REF_0);
44f54ceb 514
bb270c08 515 vis_faligndata(TMP16, TMP18, REF_4);
44f54ceb 516
bb270c08
DB
517 if (off != 0x7) {
518 vis_alignaddr_g0((void *)off_plus_1);
519 vis_faligndata(TMP14, TMP16, REF_2);
520 vis_faligndata(TMP16, TMP18, REF_6);
521 } else {
522 vis_src1(TMP16, REF_2);
523 vis_src1(TMP18, REF_6);
524 }
44f54ceb 525
bb270c08 526 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 527
bb270c08 528 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 529
bb270c08
DB
530 vis_psub16(TMP10, TMP6, TMP6);
531 vis_st64(TMP6, dest[0]);
44f54ceb 532
bb270c08
DB
533 vis_psub16(TMP12, TMP8, TMP8);
534 vis_st64_2(TMP8, dest, 8);
535 dest += stride;
536 } while (--height);
44f54ceb 537
bb270c08
DB
538 vis_ld64(ref[0], TMP0);
539 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 540
bb270c08
DB
541 vis_ld64_2(ref, 8, TMP2);
542 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 543
bb270c08
DB
544 vis_ld64_2(ref, 16, TMP4);
545 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 546
bb270c08
DB
547 vis_mul8x16(CONST_128, TMP6, TMP6);
548 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 549
bb270c08
DB
550 vis_mul8x16(CONST_128, TMP8, TMP8);
551 vis_or(REF_0, REF_2, TMP10);
44f54ceb 552
bb270c08 553 vis_or(REF_4, REF_6, TMP12);
44f54ceb 554
bb270c08 555 vis_alignaddr_g0((void *)off);
44f54ceb 556
bb270c08 557 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 558
bb270c08 559 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 560
bb270c08
DB
561 if (off != 0x7) {
562 vis_alignaddr_g0((void *)off_plus_1);
563 vis_faligndata(TMP0, TMP2, REF_2);
564 vis_faligndata(TMP2, TMP4, REF_6);
565 } else {
566 vis_src1(TMP2, REF_2);
567 vis_src1(TMP4, REF_6);
568 }
44f54ceb 569
bb270c08 570 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 571
bb270c08 572 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 573
bb270c08
DB
574 vis_psub16(TMP10, TMP6, TMP6);
575 vis_st64(TMP6, dest[0]);
44f54ceb 576
bb270c08
DB
577 vis_psub16(TMP12, TMP8, TMP8);
578 vis_st64_2(TMP8, dest, 8);
579 dest += stride;
44f54ceb 580
bb270c08 581 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 582
bb270c08 583 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 584
bb270c08 585 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 586
bb270c08
DB
587 vis_mul8x16(CONST_128, TMP6, TMP6);
588 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 589
bb270c08
DB
590 vis_mul8x16(CONST_128, TMP8, TMP8);
591 vis_or(REF_0, REF_2, TMP10);
44f54ceb 592
bb270c08 593 vis_or(REF_4, REF_6, TMP12);
44f54ceb 594
bb270c08 595 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 596
bb270c08 597 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 598
bb270c08
DB
599 vis_psub16(TMP10, TMP6, TMP6);
600 vis_st64(TMP6, dest[0]);
44f54ceb 601
bb270c08
DB
602 vis_psub16(TMP12, TMP8, TMP8);
603 vis_st64_2(TMP8, dest, 8);
44f54ceb
MN
604}
605
86decad6 606static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 607 const int stride, int height)
44f54ceb 608{
bb270c08
DB
609 unsigned long off = (unsigned long) ref & 0x7;
610 unsigned long off_plus_1 = off + 1;
44f54ceb 611
bb270c08 612 ref = vis_alignaddr(ref);
44f54ceb 613
bb270c08 614 vis_ld64(ref[0], TMP0);
44f54ceb 615
bb270c08 616 vis_ld64(ref[8], TMP2);
44f54ceb 617
bb270c08 618 vis_ld64(constants_fe[0], MASK_fe);
44f54ceb 619
bb270c08 620 vis_ld64(constants_7f[0], MASK_7f);
44f54ceb 621
bb270c08
DB
622 vis_ld64(constants128[0], CONST_128);
623 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 624
bb270c08
DB
625 if (off != 0x7) {
626 vis_alignaddr_g0((void *)off_plus_1);
627 vis_faligndata(TMP0, TMP2, REF_2);
628 } else {
629 vis_src1(TMP2, REF_2);
630 }
44f54ceb 631
bb270c08
DB
632 ref += stride;
633 height = (height >> 1) - 1;
44f54ceb 634
bb270c08
DB
635 do { /* 20 cycles */
636 vis_ld64(ref[0], TMP0);
637 vis_xor(REF_0, REF_2, TMP4);
44f54ceb 638
bb270c08
DB
639 vis_ld64_2(ref, 8, TMP2);
640 vis_and(TMP4, MASK_fe, TMP4);
641 ref += stride;
44f54ceb 642
bb270c08
DB
643 vis_ld64(ref[0], TMP8);
644 vis_or(REF_0, REF_2, TMP6);
645 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 646
bb270c08 647 vis_alignaddr_g0((void *)off);
44f54ceb 648
bb270c08
DB
649 vis_ld64_2(ref, 8, TMP10);
650 ref += stride;
651 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 652
bb270c08
DB
653 if (off != 0x7) {
654 vis_alignaddr_g0((void *)off_plus_1);
655 vis_faligndata(TMP0, TMP2, REF_2);
656 } else {
657 vis_src1(TMP2, REF_2);
658 }
44f54ceb 659
bb270c08 660 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 661
bb270c08
DB
662 vis_psub16(TMP6, TMP4, DST_0);
663 vis_st64(DST_0, dest[0]);
664 dest += stride;
44f54ceb 665
bb270c08 666 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 667
bb270c08 668 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 669
bb270c08
DB
670 vis_or(REF_0, REF_2, TMP14);
671 vis_mul8x16(CONST_128, TMP12, TMP12);
44f54ceb 672
bb270c08
DB
673 vis_alignaddr_g0((void *)off);
674 vis_faligndata(TMP8, TMP10, REF_0);
675 if (off != 0x7) {
676 vis_alignaddr_g0((void *)off_plus_1);
677 vis_faligndata(TMP8, TMP10, REF_2);
678 } else {
679 vis_src1(TMP10, REF_2);
680 }
44f54ceb 681
bb270c08 682 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 683
bb270c08
DB
684 vis_psub16(TMP14, TMP12, DST_0);
685 vis_st64(DST_0, dest[0]);
686 dest += stride;
687 } while (--height);
44f54ceb 688
bb270c08
DB
689 vis_ld64(ref[0], TMP0);
690 vis_xor(REF_0, REF_2, TMP4);
44f54ceb 691
bb270c08
DB
692 vis_ld64_2(ref, 8, TMP2);
693 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 694
bb270c08
DB
695 vis_or(REF_0, REF_2, TMP6);
696 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 697
bb270c08 698 vis_alignaddr_g0((void *)off);
44f54ceb 699
bb270c08 700 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 701
bb270c08
DB
702 if (off != 0x7) {
703 vis_alignaddr_g0((void *)off_plus_1);
704 vis_faligndata(TMP0, TMP2, REF_2);
705 } else {
706 vis_src1(TMP2, REF_2);
707 }
44f54ceb 708
bb270c08 709 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 710
bb270c08
DB
711 vis_psub16(TMP6, TMP4, DST_0);
712 vis_st64(DST_0, dest[0]);
713 dest += stride;
44f54ceb 714
bb270c08 715 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 716
bb270c08 717 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 718
bb270c08
DB
719 vis_or(REF_0, REF_2, TMP14);
720 vis_mul8x16(CONST_128, TMP12, TMP12);
44f54ceb 721
bb270c08 722 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 723
bb270c08
DB
724 vis_psub16(TMP14, TMP12, DST_0);
725 vis_st64(DST_0, dest[0]);
726 dest += stride;
44f54ceb
MN
727}
728
86decad6 729static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 730 const int stride, int height)
44f54ceb 731{
bb270c08
DB
732 unsigned long off = (unsigned long) ref & 0x7;
733 unsigned long off_plus_1 = off + 1;
44f54ceb 734
bb270c08 735 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 736
bb270c08
DB
737 vis_ld64(constants3[0], CONST_3);
738 vis_fzero(ZERO);
739 vis_ld64(constants256_512[0], CONST_256);
44f54ceb 740
bb270c08
DB
741 ref = vis_alignaddr(ref);
742 do { /* 26 cycles */
743 vis_ld64(ref[0], TMP0);
44f54ceb 744
bb270c08 745 vis_ld64(ref[8], TMP2);
44f54ceb 746
bb270c08 747 vis_alignaddr_g0((void *)off);
44f54ceb 748
bb270c08 749 vis_ld64(ref[16], TMP4);
44f54ceb 750
bb270c08
DB
751 vis_ld64(dest[0], DST_0);
752 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 753
bb270c08
DB
754 vis_ld64(dest[8], DST_2);
755 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 756
bb270c08
DB
757 if (off != 0x7) {
758 vis_alignaddr_g0((void *)off_plus_1);
759 vis_faligndata(TMP0, TMP2, REF_2);
760 vis_faligndata(TMP2, TMP4, REF_6);
761 } else {
762 vis_src1(TMP2, REF_2);
763 vis_src1(TMP4, REF_6);
764 }
44f54ceb 765
bb270c08 766 vis_mul8x16au(REF_0, CONST_256, TMP0);
44f54ceb 767
bb270c08
DB
768 vis_pmerge(ZERO, REF_2, TMP4);
769 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
44f54ceb 770
bb270c08 771 vis_pmerge(ZERO, REF_2_1, TMP6);
44f54ceb 772
bb270c08 773 vis_padd16(TMP0, TMP4, TMP0);
44f54ceb 774
bb270c08
DB
775 vis_mul8x16al(DST_0, CONST_512, TMP4);
776 vis_padd16(TMP2, TMP6, TMP2);
44f54ceb 777
bb270c08 778 vis_mul8x16al(DST_1, CONST_512, TMP6);
44f54ceb 779
bb270c08 780 vis_mul8x16au(REF_6, CONST_256, TMP12);
44f54ceb 781
bb270c08
DB
782 vis_padd16(TMP0, TMP4, TMP0);
783 vis_mul8x16au(REF_6_1, CONST_256, TMP14);
44f54ceb 784
bb270c08
DB
785 vis_padd16(TMP2, TMP6, TMP2);
786 vis_mul8x16au(REF_4, CONST_256, TMP16);
44f54ceb 787
bb270c08
DB
788 vis_padd16(TMP0, CONST_3, TMP8);
789 vis_mul8x16au(REF_4_1, CONST_256, TMP18);
44f54ceb 790
bb270c08
DB
791 vis_padd16(TMP2, CONST_3, TMP10);
792 vis_pack16(TMP8, DST_0);
44f54ceb 793
bb270c08
DB
794 vis_pack16(TMP10, DST_1);
795 vis_padd16(TMP16, TMP12, TMP0);
44f54ceb 796
bb270c08
DB
797 vis_st64(DST_0, dest[0]);
798 vis_mul8x16al(DST_2, CONST_512, TMP4);
799 vis_padd16(TMP18, TMP14, TMP2);
44f54ceb 800
bb270c08
DB
801 vis_mul8x16al(DST_3, CONST_512, TMP6);
802 vis_padd16(TMP0, CONST_3, TMP0);
44f54ceb 803
bb270c08 804 vis_padd16(TMP2, CONST_3, TMP2);
44f54ceb 805
bb270c08 806 vis_padd16(TMP0, TMP4, TMP0);
44f54ceb 807
bb270c08
DB
808 vis_padd16(TMP2, TMP6, TMP2);
809 vis_pack16(TMP0, DST_2);
44f54ceb 810
bb270c08
DB
811 vis_pack16(TMP2, DST_3);
812 vis_st64(DST_2, dest[8]);
44f54ceb 813
bb270c08
DB
814 ref += stride;
815 dest += stride;
816 } while (--height);
44f54ceb
MN
817}
818
86decad6 819static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 820 const int stride, int height)
44f54ceb 821{
bb270c08
DB
822 unsigned long off = (unsigned long) ref & 0x7;
823 unsigned long off_plus_1 = off + 1;
824 int stride_times_2 = stride << 1;
44f54ceb 825
bb270c08 826 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 827
bb270c08
DB
828 vis_ld64(constants3[0], CONST_3);
829 vis_fzero(ZERO);
830 vis_ld64(constants256_512[0], CONST_256);
44f54ceb 831
bb270c08
DB
832 ref = vis_alignaddr(ref);
833 height >>= 2;
834 do { /* 47 cycles */
835 vis_ld64(ref[0], TMP0);
44f54ceb 836
bb270c08
DB
837 vis_ld64_2(ref, 8, TMP2);
838 ref += stride;
44f54ceb 839
bb270c08 840 vis_alignaddr_g0((void *)off);
44f54ceb 841
bb270c08
DB
842 vis_ld64(ref[0], TMP4);
843 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 844
bb270c08
DB
845 vis_ld64_2(ref, 8, TMP6);
846 ref += stride;
44f54ceb 847
bb270c08 848 vis_ld64(ref[0], TMP8);
44f54ceb 849
bb270c08
DB
850 vis_ld64_2(ref, 8, TMP10);
851 ref += stride;
852 vis_faligndata(TMP4, TMP6, REF_4);
44f54ceb 853
bb270c08 854 vis_ld64(ref[0], TMP12);
44f54ceb 855
bb270c08
DB
856 vis_ld64_2(ref, 8, TMP14);
857 ref += stride;
858 vis_faligndata(TMP8, TMP10, REF_S0);
44f54ceb 859
bb270c08 860 vis_faligndata(TMP12, TMP14, REF_S4);
44f54ceb 861
bb270c08
DB
862 if (off != 0x7) {
863 vis_alignaddr_g0((void *)off_plus_1);
44f54ceb 864
bb270c08
DB
865 vis_ld64(dest[0], DST_0);
866 vis_faligndata(TMP0, TMP2, REF_2);
44f54ceb 867
bb270c08
DB
868 vis_ld64_2(dest, stride, DST_2);
869 vis_faligndata(TMP4, TMP6, REF_6);
44f54ceb 870
bb270c08 871 vis_faligndata(TMP8, TMP10, REF_S2);
44f54ceb 872
bb270c08
DB
873 vis_faligndata(TMP12, TMP14, REF_S6);
874 } else {
875 vis_ld64(dest[0], DST_0);
876 vis_src1(TMP2, REF_2);
44f54ceb 877
bb270c08
DB
878 vis_ld64_2(dest, stride, DST_2);
879 vis_src1(TMP6, REF_6);
44f54ceb 880
bb270c08 881 vis_src1(TMP10, REF_S2);
44f54ceb 882
bb270c08
DB
883 vis_src1(TMP14, REF_S6);
884 }
44f54ceb 885
bb270c08
DB
886 vis_pmerge(ZERO, REF_0, TMP0);
887 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
44f54ceb 888
bb270c08
DB
889 vis_pmerge(ZERO, REF_2, TMP4);
890 vis_mul8x16au(REF_2_1, CONST_256, TMP6);
44f54ceb 891
bb270c08
DB
892 vis_padd16(TMP0, CONST_3, TMP0);
893 vis_mul8x16al(DST_0, CONST_512, TMP16);
44f54ceb 894
bb270c08
DB
895 vis_padd16(TMP2, CONST_3, TMP2);
896 vis_mul8x16al(DST_1, CONST_512, TMP18);
44f54ceb 897
bb270c08
DB
898 vis_padd16(TMP0, TMP4, TMP0);
899 vis_mul8x16au(REF_4, CONST_256, TMP8);
44f54ceb 900
bb270c08
DB
901 vis_padd16(TMP2, TMP6, TMP2);
902 vis_mul8x16au(REF_4_1, CONST_256, TMP10);
44f54ceb 903
bb270c08
DB
904 vis_padd16(TMP0, TMP16, TMP0);
905 vis_mul8x16au(REF_6, CONST_256, TMP12);
44f54ceb 906
bb270c08
DB
907 vis_padd16(TMP2, TMP18, TMP2);
908 vis_mul8x16au(REF_6_1, CONST_256, TMP14);
44f54ceb 909
bb270c08
DB
910 vis_padd16(TMP8, CONST_3, TMP8);
911 vis_mul8x16al(DST_2, CONST_512, TMP16);
44f54ceb 912
bb270c08
DB
913 vis_padd16(TMP8, TMP12, TMP8);
914 vis_mul8x16al(DST_3, CONST_512, TMP18);
44f54ceb 915
bb270c08
DB
916 vis_padd16(TMP10, TMP14, TMP10);
917 vis_pack16(TMP0, DST_0);
44f54ceb 918
bb270c08
DB
919 vis_pack16(TMP2, DST_1);
920 vis_st64(DST_0, dest[0]);
921 dest += stride;
922 vis_padd16(TMP10, CONST_3, TMP10);
44f54ceb 923
bb270c08
DB
924 vis_ld64_2(dest, stride, DST_0);
925 vis_padd16(TMP8, TMP16, TMP8);
44f54ceb 926
bb270c08
DB
927 vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);
928 vis_padd16(TMP10, TMP18, TMP10);
929 vis_pack16(TMP8, DST_2);
44f54ceb 930
bb270c08
DB
931 vis_pack16(TMP10, DST_3);
932 vis_st64(DST_2, dest[0]);
933 dest += stride;
44f54ceb 934
bb270c08
DB
935 vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
936 vis_pmerge(ZERO, REF_S0, TMP0);
44f54ceb 937
bb270c08
DB
938 vis_pmerge(ZERO, REF_S2, TMP24);
939 vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
44f54ceb 940
bb270c08
DB
941 vis_padd16(TMP0, CONST_3, TMP0);
942 vis_mul8x16au(REF_S4, CONST_256, TMP8);
44f54ceb 943
bb270c08
DB
944 vis_padd16(TMP2, CONST_3, TMP2);
945 vis_mul8x16au(REF_S4_1, CONST_256, TMP10);
44f54ceb 946
bb270c08
DB
947 vis_padd16(TMP0, TMP24, TMP0);
948 vis_mul8x16au(REF_S6, CONST_256, TMP12);
44f54ceb 949
bb270c08
DB
950 vis_padd16(TMP2, TMP6, TMP2);
951 vis_mul8x16au(REF_S6_1, CONST_256, TMP14);
44f54ceb 952
bb270c08
DB
953 vis_padd16(TMP8, CONST_3, TMP8);
954 vis_mul8x16al(DST_0, CONST_512, TMP16);
44f54ceb 955
bb270c08
DB
956 vis_padd16(TMP10, CONST_3, TMP10);
957 vis_mul8x16al(DST_1, CONST_512, TMP18);
44f54ceb 958
bb270c08
DB
959 vis_padd16(TMP8, TMP12, TMP8);
960 vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);
44f54ceb 961
bb270c08
DB
962 vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
963 vis_padd16(TMP0, TMP16, TMP0);
44f54ceb 964
bb270c08
DB
965 vis_padd16(TMP2, TMP18, TMP2);
966 vis_pack16(TMP0, DST_0);
44f54ceb 967
bb270c08
DB
968 vis_padd16(TMP10, TMP14, TMP10);
969 vis_pack16(TMP2, DST_1);
970 vis_st64(DST_0, dest[0]);
971 dest += stride;
44f54ceb 972
bb270c08 973 vis_padd16(TMP8, TMP20, TMP8);
44f54ceb 974
bb270c08
DB
975 vis_padd16(TMP10, TMP22, TMP10);
976 vis_pack16(TMP8, DST_2);
44f54ceb 977
bb270c08
DB
978 vis_pack16(TMP10, DST_3);
979 vis_st64(DST_2, dest[0]);
980 dest += stride;
981 } while (--height);
44f54ceb
MN
982}
983
86decad6 984static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 985 const int stride, int height)
44f54ceb 986{
bb270c08
DB
987 ref = vis_alignaddr(ref);
988 vis_ld64(ref[0], TMP0);
44f54ceb 989
bb270c08 990 vis_ld64_2(ref, 8, TMP2);
44f54ceb 991
bb270c08
DB
992 vis_ld64_2(ref, 16, TMP4);
993 ref += stride;
44f54ceb 994
bb270c08
DB
995 vis_ld64(ref[0], TMP6);
996 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 997
bb270c08
DB
998 vis_ld64_2(ref, 8, TMP8);
999 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 1000
bb270c08
DB
1001 vis_ld64_2(ref, 16, TMP10);
1002 ref += stride;
44f54ceb 1003
bb270c08
DB
1004 vis_ld64(constants_fe[0], MASK_fe);
1005 vis_faligndata(TMP6, TMP8, REF_2);
44f54ceb 1006
bb270c08
DB
1007 vis_ld64(constants_7f[0], MASK_7f);
1008 vis_faligndata(TMP8, TMP10, REF_6);
44f54ceb 1009
bb270c08
DB
1010 vis_ld64(constants128[0], CONST_128);
1011 height = (height >> 1) - 1;
1012 do { /* 24 cycles */
1013 vis_ld64(ref[0], TMP0);
1014 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 1015
bb270c08
DB
1016 vis_ld64_2(ref, 8, TMP2);
1017 vis_xor(REF_4, REF_6, TMP16);
44f54ceb 1018
bb270c08
DB
1019 vis_ld64_2(ref, 16, TMP4);
1020 ref += stride;
1021 vis_or(REF_0, REF_2, TMP14);
44f54ceb 1022
bb270c08
DB
1023 vis_ld64(ref[0], TMP6);
1024 vis_or(REF_4, REF_6, TMP18);
44f54ceb 1025
bb270c08
DB
1026 vis_ld64_2(ref, 8, TMP8);
1027 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1028
bb270c08
DB
1029 vis_ld64_2(ref, 16, TMP10);
1030 ref += stride;
1031 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 1032
bb270c08 1033 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 1034
bb270c08
DB
1035 vis_and(TMP16, MASK_fe, TMP16);
1036 vis_mul8x16(CONST_128, TMP12, TMP12);
44f54ceb 1037
bb270c08
DB
1038 vis_mul8x16(CONST_128, TMP16, TMP16);
1039 vis_xor(REF_0, REF_2, TMP0);
44f54ceb 1040
bb270c08 1041 vis_xor(REF_4, REF_6, TMP2);
44f54ceb 1042
bb270c08 1043 vis_or(REF_0, REF_2, TMP20);
44f54ceb 1044
bb270c08 1045 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 1046
bb270c08 1047 vis_and(TMP16, MASK_7f, TMP16);
44f54ceb 1048
bb270c08
DB
1049 vis_psub16(TMP14, TMP12, TMP12);
1050 vis_st64(TMP12, dest[0]);
44f54ceb 1051
bb270c08
DB
1052 vis_psub16(TMP18, TMP16, TMP16);
1053 vis_st64_2(TMP16, dest, 8);
1054 dest += stride;
44f54ceb 1055
bb270c08 1056 vis_or(REF_4, REF_6, TMP18);
44f54ceb 1057
bb270c08 1058 vis_and(TMP0, MASK_fe, TMP0);
44f54ceb 1059
bb270c08
DB
1060 vis_and(TMP2, MASK_fe, TMP2);
1061 vis_mul8x16(CONST_128, TMP0, TMP0);
44f54ceb 1062
bb270c08
DB
1063 vis_faligndata(TMP6, TMP8, REF_2);
1064 vis_mul8x16(CONST_128, TMP2, TMP2);
44f54ceb 1065
bb270c08 1066 vis_faligndata(TMP8, TMP10, REF_6);
44f54ceb 1067
bb270c08 1068 vis_and(TMP0, MASK_7f, TMP0);
44f54ceb 1069
bb270c08 1070 vis_and(TMP2, MASK_7f, TMP2);
44f54ceb 1071
bb270c08
DB
1072 vis_psub16(TMP20, TMP0, TMP0);
1073 vis_st64(TMP0, dest[0]);
44f54ceb 1074
bb270c08
DB
1075 vis_psub16(TMP18, TMP2, TMP2);
1076 vis_st64_2(TMP2, dest, 8);
1077 dest += stride;
1078 } while (--height);
44f54ceb 1079
bb270c08
DB
1080 vis_ld64(ref[0], TMP0);
1081 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 1082
bb270c08
DB
1083 vis_ld64_2(ref, 8, TMP2);
1084 vis_xor(REF_4, REF_6, TMP16);
44f54ceb 1085
bb270c08
DB
1086 vis_ld64_2(ref, 16, TMP4);
1087 vis_or(REF_0, REF_2, TMP14);
44f54ceb 1088
bb270c08 1089 vis_or(REF_4, REF_6, TMP18);
44f54ceb 1090
bb270c08 1091 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1092
bb270c08 1093 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 1094
bb270c08 1095 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 1096
bb270c08
DB
1097 vis_and(TMP16, MASK_fe, TMP16);
1098 vis_mul8x16(CONST_128, TMP12, TMP12);
44f54ceb 1099
bb270c08
DB
1100 vis_mul8x16(CONST_128, TMP16, TMP16);
1101 vis_xor(REF_0, REF_2, TMP0);
44f54ceb 1102
bb270c08 1103 vis_xor(REF_4, REF_6, TMP2);
44f54ceb 1104
bb270c08 1105 vis_or(REF_0, REF_2, TMP20);
44f54ceb 1106
bb270c08 1107 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 1108
bb270c08 1109 vis_and(TMP16, MASK_7f, TMP16);
44f54ceb 1110
bb270c08
DB
1111 vis_psub16(TMP14, TMP12, TMP12);
1112 vis_st64(TMP12, dest[0]);
44f54ceb 1113
bb270c08
DB
1114 vis_psub16(TMP18, TMP16, TMP16);
1115 vis_st64_2(TMP16, dest, 8);
1116 dest += stride;
44f54ceb 1117
bb270c08 1118 vis_or(REF_4, REF_6, TMP18);
44f54ceb 1119
bb270c08 1120 vis_and(TMP0, MASK_fe, TMP0);
44f54ceb 1121
bb270c08
DB
1122 vis_and(TMP2, MASK_fe, TMP2);
1123 vis_mul8x16(CONST_128, TMP0, TMP0);
44f54ceb 1124
bb270c08 1125 vis_mul8x16(CONST_128, TMP2, TMP2);
44f54ceb 1126
bb270c08 1127 vis_and(TMP0, MASK_7f, TMP0);
44f54ceb 1128
bb270c08 1129 vis_and(TMP2, MASK_7f, TMP2);
44f54ceb 1130
bb270c08
DB
1131 vis_psub16(TMP20, TMP0, TMP0);
1132 vis_st64(TMP0, dest[0]);
44f54ceb 1133
bb270c08
DB
1134 vis_psub16(TMP18, TMP2, TMP2);
1135 vis_st64_2(TMP2, dest, 8);
44f54ceb
MN
1136}
1137
86decad6 1138static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 1139 const int stride, int height)
44f54ceb 1140{
bb270c08
DB
1141 ref = vis_alignaddr(ref);
1142 vis_ld64(ref[0], TMP0);
44f54ceb 1143
bb270c08
DB
1144 vis_ld64_2(ref, 8, TMP2);
1145 ref += stride;
44f54ceb 1146
bb270c08 1147 vis_ld64(ref[0], TMP4);
44f54ceb 1148
bb270c08
DB
1149 vis_ld64_2(ref, 8, TMP6);
1150 ref += stride;
44f54ceb 1151
bb270c08
DB
1152 vis_ld64(constants_fe[0], MASK_fe);
1153 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1154
bb270c08
DB
1155 vis_ld64(constants_7f[0], MASK_7f);
1156 vis_faligndata(TMP4, TMP6, REF_2);
44f54ceb 1157
bb270c08
DB
1158 vis_ld64(constants128[0], CONST_128);
1159 height = (height >> 1) - 1;
1160 do { /* 12 cycles */
1161 vis_ld64(ref[0], TMP0);
1162 vis_xor(REF_0, REF_2, TMP4);
44f54ceb 1163
bb270c08
DB
1164 vis_ld64_2(ref, 8, TMP2);
1165 ref += stride;
1166 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 1167
bb270c08
DB
1168 vis_or(REF_0, REF_2, TMP6);
1169 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 1170
bb270c08
DB
1171 vis_faligndata(TMP0, TMP2, REF_0);
1172 vis_ld64(ref[0], TMP0);
44f54ceb 1173
bb270c08
DB
1174 vis_ld64_2(ref, 8, TMP2);
1175 ref += stride;
1176 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 1177
bb270c08 1178 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 1179
bb270c08 1180 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 1181
bb270c08
DB
1182 vis_mul8x16(CONST_128, TMP12, TMP12);
1183 vis_or(REF_0, REF_2, TMP14);
44f54ceb 1184
bb270c08
DB
1185 vis_psub16(TMP6, TMP4, DST_0);
1186 vis_st64(DST_0, dest[0]);
1187 dest += stride;
44f54ceb 1188
bb270c08 1189 vis_faligndata(TMP0, TMP2, REF_2);
44f54ceb 1190
bb270c08 1191 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 1192
bb270c08
DB
1193 vis_psub16(TMP14, TMP12, DST_0);
1194 vis_st64(DST_0, dest[0]);
1195 dest += stride;
1196 } while (--height);
44f54ceb 1197
bb270c08
DB
1198 vis_ld64(ref[0], TMP0);
1199 vis_xor(REF_0, REF_2, TMP4);
44f54ceb 1200
bb270c08
DB
1201 vis_ld64_2(ref, 8, TMP2);
1202 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 1203
bb270c08
DB
1204 vis_or(REF_0, REF_2, TMP6);
1205 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 1206
bb270c08 1207 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1208
bb270c08 1209 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 1210
bb270c08 1211 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 1212
bb270c08 1213 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 1214
bb270c08
DB
1215 vis_mul8x16(CONST_128, TMP12, TMP12);
1216 vis_or(REF_0, REF_2, TMP14);
44f54ceb 1217
bb270c08
DB
1218 vis_psub16(TMP6, TMP4, DST_0);
1219 vis_st64(DST_0, dest[0]);
1220 dest += stride;
44f54ceb 1221
bb270c08 1222 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 1223
bb270c08
DB
1224 vis_psub16(TMP14, TMP12, DST_0);
1225 vis_st64(DST_0, dest[0]);
44f54ceb
MN
1226}
1227
86decad6 1228static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 1229 const int stride, int height)
44f54ceb 1230{
bb270c08
DB
1231 int stride_8 = stride + 8;
1232 int stride_16 = stride + 16;
44f54ceb 1233
bb270c08 1234 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 1235
bb270c08 1236 ref = vis_alignaddr(ref);
44f54ceb 1237
bb270c08
DB
1238 vis_ld64(ref[ 0], TMP0);
1239 vis_fzero(ZERO);
44f54ceb 1240
bb270c08 1241 vis_ld64(ref[ 8], TMP2);
44f54ceb 1242
bb270c08 1243 vis_ld64(ref[16], TMP4);
44f54ceb 1244
bb270c08
DB
1245 vis_ld64(constants3[0], CONST_3);
1246 vis_faligndata(TMP0, TMP2, REF_2);
44f54ceb 1247
bb270c08
DB
1248 vis_ld64(constants256_512[0], CONST_256);
1249 vis_faligndata(TMP2, TMP4, REF_6);
1250 height >>= 1;
44f54ceb 1251
bb270c08
DB
1252 do { /* 31 cycles */
1253 vis_ld64_2(ref, stride, TMP0);
1254 vis_pmerge(ZERO, REF_2, TMP12);
1255 vis_mul8x16au(REF_2_1, CONST_256, TMP14);
44f54ceb 1256
bb270c08
DB
1257 vis_ld64_2(ref, stride_8, TMP2);
1258 vis_pmerge(ZERO, REF_6, TMP16);
1259 vis_mul8x16au(REF_6_1, CONST_256, TMP18);
44f54ceb 1260
bb270c08
DB
1261 vis_ld64_2(ref, stride_16, TMP4);
1262 ref += stride;
44f54ceb 1263
bb270c08
DB
1264 vis_ld64(dest[0], DST_0);
1265 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1266
bb270c08
DB
1267 vis_ld64_2(dest, 8, DST_2);
1268 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 1269
bb270c08
DB
1270 vis_ld64_2(ref, stride, TMP6);
1271 vis_pmerge(ZERO, REF_0, TMP0);
1272 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
44f54ceb 1273
bb270c08
DB
1274 vis_ld64_2(ref, stride_8, TMP8);
1275 vis_pmerge(ZERO, REF_4, TMP4);
44f54ceb 1276
bb270c08
DB
1277 vis_ld64_2(ref, stride_16, TMP10);
1278 ref += stride;
44f54ceb 1279
bb270c08
DB
1280 vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
1281 vis_faligndata(TMP6, TMP8, REF_2);
1282 vis_mul8x16au(REF_4_1, CONST_256, TMP6);
44f54ceb 1283
bb270c08
DB
1284 vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
1285 vis_faligndata(TMP8, TMP10, REF_6);
1286 vis_mul8x16al(DST_0, CONST_512, TMP20);
44f54ceb 1287
bb270c08
DB
1288 vis_padd16(TMP0, CONST_3, TMP0);
1289 vis_mul8x16al(DST_1, CONST_512, TMP22);
44f54ceb 1290
bb270c08
DB
1291 vis_padd16(TMP2, CONST_3, TMP2);
1292 vis_mul8x16al(DST_2, CONST_512, TMP24);
44f54ceb 1293
bb270c08
DB
1294 vis_padd16(TMP4, CONST_3, TMP4);
1295 vis_mul8x16al(DST_3, CONST_512, TMP26);
44f54ceb 1296
bb270c08 1297 vis_padd16(TMP6, CONST_3, TMP6);
44f54ceb 1298
bb270c08
DB
1299 vis_padd16(TMP12, TMP20, TMP12);
1300 vis_mul8x16al(REF_S0, CONST_512, TMP20);
44f54ceb 1301
bb270c08
DB
1302 vis_padd16(TMP14, TMP22, TMP14);
1303 vis_mul8x16al(REF_S0_1, CONST_512, TMP22);
44f54ceb 1304
bb270c08
DB
1305 vis_padd16(TMP16, TMP24, TMP16);
1306 vis_mul8x16al(REF_S2, CONST_512, TMP24);
44f54ceb 1307
bb270c08
DB
1308 vis_padd16(TMP18, TMP26, TMP18);
1309 vis_mul8x16al(REF_S2_1, CONST_512, TMP26);
44f54ceb 1310
bb270c08
DB
1311 vis_padd16(TMP12, TMP0, TMP12);
1312 vis_mul8x16au(REF_2, CONST_256, TMP28);
44f54ceb 1313
bb270c08
DB
1314 vis_padd16(TMP14, TMP2, TMP14);
1315 vis_mul8x16au(REF_2_1, CONST_256, TMP30);
44f54ceb 1316
bb270c08
DB
1317 vis_padd16(TMP16, TMP4, TMP16);
1318 vis_mul8x16au(REF_6, CONST_256, REF_S4);
44f54ceb 1319
bb270c08
DB
1320 vis_padd16(TMP18, TMP6, TMP18);
1321 vis_mul8x16au(REF_6_1, CONST_256, REF_S6);
44f54ceb 1322
bb270c08
DB
1323 vis_pack16(TMP12, DST_0);
1324 vis_padd16(TMP28, TMP0, TMP12);
44f54ceb 1325
bb270c08
DB
1326 vis_pack16(TMP14, DST_1);
1327 vis_st64(DST_0, dest[0]);
1328 vis_padd16(TMP30, TMP2, TMP14);
44f54ceb 1329
bb270c08
DB
1330 vis_pack16(TMP16, DST_2);
1331 vis_padd16(REF_S4, TMP4, TMP16);
44f54ceb 1332
bb270c08
DB
1333 vis_pack16(TMP18, DST_3);
1334 vis_st64_2(DST_2, dest, 8);
1335 dest += stride;
1336 vis_padd16(REF_S6, TMP6, TMP18);
44f54ceb 1337
bb270c08 1338 vis_padd16(TMP12, TMP20, TMP12);
44f54ceb 1339
bb270c08
DB
1340 vis_padd16(TMP14, TMP22, TMP14);
1341 vis_pack16(TMP12, DST_0);
44f54ceb 1342
bb270c08
DB
1343 vis_padd16(TMP16, TMP24, TMP16);
1344 vis_pack16(TMP14, DST_1);
1345 vis_st64(DST_0, dest[0]);
44f54ceb 1346
bb270c08
DB
1347 vis_padd16(TMP18, TMP26, TMP18);
1348 vis_pack16(TMP16, DST_2);
44f54ceb 1349
bb270c08
DB
1350 vis_pack16(TMP18, DST_3);
1351 vis_st64_2(DST_2, dest, 8);
1352 dest += stride;
1353 } while (--height);
44f54ceb
MN
1354}
1355
86decad6 1356static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 1357 const int stride, int height)
44f54ceb 1358{
bb270c08 1359 int stride_8 = stride + 8;
44f54ceb 1360
bb270c08 1361 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 1362
bb270c08 1363 ref = vis_alignaddr(ref);
44f54ceb 1364
bb270c08
DB
1365 vis_ld64(ref[ 0], TMP0);
1366 vis_fzero(ZERO);
44f54ceb 1367
bb270c08 1368 vis_ld64(ref[ 8], TMP2);
44f54ceb 1369
bb270c08
DB
1370 vis_ld64(constants3[0], CONST_3);
1371 vis_faligndata(TMP0, TMP2, REF_2);
44f54ceb 1372
bb270c08 1373 vis_ld64(constants256_512[0], CONST_256);
44f54ceb 1374
bb270c08
DB
1375 height >>= 1;
1376 do { /* 20 cycles */
1377 vis_ld64_2(ref, stride, TMP0);
1378 vis_pmerge(ZERO, REF_2, TMP8);
1379 vis_mul8x16au(REF_2_1, CONST_256, TMP10);
44f54ceb 1380
bb270c08
DB
1381 vis_ld64_2(ref, stride_8, TMP2);
1382 ref += stride;
44f54ceb 1383
bb270c08 1384 vis_ld64(dest[0], DST_0);
44f54ceb 1385
bb270c08
DB
1386 vis_ld64_2(dest, stride, DST_2);
1387 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1388
bb270c08
DB
1389 vis_ld64_2(ref, stride, TMP4);
1390 vis_mul8x16al(DST_0, CONST_512, TMP16);
1391 vis_pmerge(ZERO, REF_0, TMP12);
44f54ceb 1392
bb270c08
DB
1393 vis_ld64_2(ref, stride_8, TMP6);
1394 ref += stride;
1395 vis_mul8x16al(DST_1, CONST_512, TMP18);
1396 vis_pmerge(ZERO, REF_0_1, TMP14);
44f54ceb 1397
bb270c08
DB
1398 vis_padd16(TMP12, CONST_3, TMP12);
1399 vis_mul8x16al(DST_2, CONST_512, TMP24);
44f54ceb 1400
bb270c08
DB
1401 vis_padd16(TMP14, CONST_3, TMP14);
1402 vis_mul8x16al(DST_3, CONST_512, TMP26);
44f54ceb 1403
bb270c08 1404 vis_faligndata(TMP4, TMP6, REF_2);
44f54ceb 1405
bb270c08 1406 vis_padd16(TMP8, TMP12, TMP8);
44f54ceb 1407
bb270c08
DB
1408 vis_padd16(TMP10, TMP14, TMP10);
1409 vis_mul8x16au(REF_2, CONST_256, TMP20);
44f54ceb 1410
bb270c08
DB
1411 vis_padd16(TMP8, TMP16, TMP0);
1412 vis_mul8x16au(REF_2_1, CONST_256, TMP22);
44f54ceb 1413
bb270c08
DB
1414 vis_padd16(TMP10, TMP18, TMP2);
1415 vis_pack16(TMP0, DST_0);
44f54ceb 1416
bb270c08
DB
1417 vis_pack16(TMP2, DST_1);
1418 vis_st64(DST_0, dest[0]);
1419 dest += stride;
1420 vis_padd16(TMP12, TMP20, TMP12);
44f54ceb 1421
bb270c08 1422 vis_padd16(TMP14, TMP22, TMP14);
44f54ceb 1423
bb270c08 1424 vis_padd16(TMP12, TMP24, TMP0);
44f54ceb 1425
bb270c08
DB
1426 vis_padd16(TMP14, TMP26, TMP2);
1427 vis_pack16(TMP0, DST_2);
44f54ceb 1428
bb270c08
DB
1429 vis_pack16(TMP2, DST_3);
1430 vis_st64(DST_2, dest[0]);
1431 dest += stride;
1432 } while (--height);
44f54ceb
MN
1433}
1434
86decad6 1435static void MC_put_xy_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 1436 const int stride, int height)
44f54ceb 1437{
bb270c08
DB
1438 unsigned long off = (unsigned long) ref & 0x7;
1439 unsigned long off_plus_1 = off + 1;
1440 int stride_8 = stride + 8;
1441 int stride_16 = stride + 16;
44f54ceb 1442
bb270c08 1443 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 1444
bb270c08 1445 ref = vis_alignaddr(ref);
44f54ceb 1446
bb270c08
DB
1447 vis_ld64(ref[ 0], TMP0);
1448 vis_fzero(ZERO);
44f54ceb 1449
bb270c08 1450 vis_ld64(ref[ 8], TMP2);
44f54ceb 1451
bb270c08 1452 vis_ld64(ref[16], TMP4);
44f54ceb 1453
bb270c08
DB
1454 vis_ld64(constants2[0], CONST_2);
1455 vis_faligndata(TMP0, TMP2, REF_S0);
44f54ceb 1456
bb270c08
DB
1457 vis_ld64(constants256_512[0], CONST_256);
1458 vis_faligndata(TMP2, TMP4, REF_S4);
44f54ceb 1459
bb270c08
DB
1460 if (off != 0x7) {
1461 vis_alignaddr_g0((void *)off_plus_1);
1462 vis_faligndata(TMP0, TMP2, REF_S2);
1463 vis_faligndata(TMP2, TMP4, REF_S6);
1464 } else {
1465 vis_src1(TMP2, REF_S2);
1466 vis_src1(TMP4, REF_S6);
1467 }
44f54ceb 1468
bb270c08
DB
1469 height >>= 1;
1470 do {
1471 vis_ld64_2(ref, stride, TMP0);
1472 vis_mul8x16au(REF_S0, CONST_256, TMP12);
1473 vis_pmerge(ZERO, REF_S0_1, TMP14);
44f54ceb 1474
bb270c08 1475 vis_alignaddr_g0((void *)off);
44f54ceb 1476
bb270c08
DB
1477 vis_ld64_2(ref, stride_8, TMP2);
1478 vis_mul8x16au(REF_S2, CONST_256, TMP16);
1479 vis_pmerge(ZERO, REF_S2_1, TMP18);
44f54ceb 1480
bb270c08
DB
1481 vis_ld64_2(ref, stride_16, TMP4);
1482 ref += stride;
1483 vis_mul8x16au(REF_S4, CONST_256, TMP20);
1484 vis_pmerge(ZERO, REF_S4_1, TMP22);
44f54ceb 1485
bb270c08
DB
1486 vis_ld64_2(ref, stride, TMP6);
1487 vis_mul8x16au(REF_S6, CONST_256, TMP24);
1488 vis_pmerge(ZERO, REF_S6_1, TMP26);
44f54ceb 1489
bb270c08
DB
1490 vis_ld64_2(ref, stride_8, TMP8);
1491 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1492
bb270c08
DB
1493 vis_ld64_2(ref, stride_16, TMP10);
1494 ref += stride;
1495 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 1496
bb270c08 1497 vis_faligndata(TMP6, TMP8, REF_S0);
44f54ceb 1498
bb270c08 1499 vis_faligndata(TMP8, TMP10, REF_S4);
44f54ceb 1500
bb270c08
DB
1501 if (off != 0x7) {
1502 vis_alignaddr_g0((void *)off_plus_1);
1503 vis_faligndata(TMP0, TMP2, REF_2);
1504 vis_faligndata(TMP2, TMP4, REF_6);
1505 vis_faligndata(TMP6, TMP8, REF_S2);
1506 vis_faligndata(TMP8, TMP10, REF_S6);
1507 } else {
1508 vis_src1(TMP2, REF_2);
1509 vis_src1(TMP4, REF_6);
1510 vis_src1(TMP8, REF_S2);
1511 vis_src1(TMP10, REF_S6);
1512 }
44f54ceb 1513
bb270c08
DB
1514 vis_mul8x16au(REF_0, CONST_256, TMP0);
1515 vis_pmerge(ZERO, REF_0_1, TMP2);
44f54ceb 1516
bb270c08
DB
1517 vis_mul8x16au(REF_2, CONST_256, TMP4);
1518 vis_pmerge(ZERO, REF_2_1, TMP6);
44f54ceb 1519
bb270c08
DB
1520 vis_padd16(TMP0, CONST_2, TMP8);
1521 vis_mul8x16au(REF_4, CONST_256, TMP0);
44f54ceb 1522
bb270c08
DB
1523 vis_padd16(TMP2, CONST_2, TMP10);
1524 vis_mul8x16au(REF_4_1, CONST_256, TMP2);
44f54ceb 1525
bb270c08
DB
1526 vis_padd16(TMP8, TMP4, TMP8);
1527 vis_mul8x16au(REF_6, CONST_256, TMP4);
44f54ceb 1528
bb270c08
DB
1529 vis_padd16(TMP10, TMP6, TMP10);
1530 vis_mul8x16au(REF_6_1, CONST_256, TMP6);
44f54ceb 1531
bb270c08 1532 vis_padd16(TMP12, TMP8, TMP12);
44f54ceb 1533
bb270c08 1534 vis_padd16(TMP14, TMP10, TMP14);
44f54ceb 1535
bb270c08 1536 vis_padd16(TMP12, TMP16, TMP12);
44f54ceb 1537
bb270c08
DB
1538 vis_padd16(TMP14, TMP18, TMP14);
1539 vis_pack16(TMP12, DST_0);
44f54ceb 1540
bb270c08
DB
1541 vis_pack16(TMP14, DST_1);
1542 vis_st64(DST_0, dest[0]);
1543 vis_padd16(TMP0, CONST_2, TMP12);
44f54ceb 1544
bb270c08
DB
1545 vis_mul8x16au(REF_S0, CONST_256, TMP0);
1546 vis_padd16(TMP2, CONST_2, TMP14);
44f54ceb 1547
bb270c08
DB
1548 vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
1549 vis_padd16(TMP12, TMP4, TMP12);
44f54ceb 1550
bb270c08
DB
1551 vis_mul8x16au(REF_S2, CONST_256, TMP4);
1552 vis_padd16(TMP14, TMP6, TMP14);
44f54ceb 1553
bb270c08
DB
1554 vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
1555 vis_padd16(TMP20, TMP12, TMP20);
44f54ceb 1556
bb270c08 1557 vis_padd16(TMP22, TMP14, TMP22);
44f54ceb 1558
bb270c08 1559 vis_padd16(TMP20, TMP24, TMP20);
44f54ceb 1560
bb270c08
DB
1561 vis_padd16(TMP22, TMP26, TMP22);
1562 vis_pack16(TMP20, DST_2);
44f54ceb 1563
bb270c08
DB
1564 vis_pack16(TMP22, DST_3);
1565 vis_st64_2(DST_2, dest, 8);
1566 dest += stride;
1567 vis_padd16(TMP0, TMP4, TMP24);
44f54ceb 1568
bb270c08
DB
1569 vis_mul8x16au(REF_S4, CONST_256, TMP0);
1570 vis_padd16(TMP2, TMP6, TMP26);
44f54ceb 1571
bb270c08
DB
1572 vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
1573 vis_padd16(TMP24, TMP8, TMP24);
44f54ceb 1574
bb270c08
DB
1575 vis_padd16(TMP26, TMP10, TMP26);
1576 vis_pack16(TMP24, DST_0);
44f54ceb 1577
bb270c08
DB
1578 vis_pack16(TMP26, DST_1);
1579 vis_st64(DST_0, dest[0]);
1580 vis_pmerge(ZERO, REF_S6, TMP4);
44f54ceb 1581
bb270c08 1582 vis_pmerge(ZERO, REF_S6_1, TMP6);
44f54ceb 1583
bb270c08 1584 vis_padd16(TMP0, TMP4, TMP0);
44f54ceb 1585
bb270c08 1586 vis_padd16(TMP2, TMP6, TMP2);
44f54ceb 1587
bb270c08 1588 vis_padd16(TMP0, TMP12, TMP0);
44f54ceb 1589
bb270c08
DB
1590 vis_padd16(TMP2, TMP14, TMP2);
1591 vis_pack16(TMP0, DST_2);
44f54ceb 1592
bb270c08
DB
1593 vis_pack16(TMP2, DST_3);
1594 vis_st64_2(DST_2, dest, 8);
1595 dest += stride;
1596 } while (--height);
44f54ceb
MN
1597}
1598
86decad6 1599static void MC_put_xy_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 1600 const int stride, int height)
44f54ceb 1601{
bb270c08
DB
1602 unsigned long off = (unsigned long) ref & 0x7;
1603 unsigned long off_plus_1 = off + 1;
1604 int stride_8 = stride + 8;
44f54ceb 1605
bb270c08 1606 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 1607
bb270c08 1608 ref = vis_alignaddr(ref);
44f54ceb 1609
bb270c08
DB
1610 vis_ld64(ref[ 0], TMP0);
1611 vis_fzero(ZERO);
44f54ceb 1612
bb270c08 1613 vis_ld64(ref[ 8], TMP2);
44f54ceb 1614
bb270c08 1615 vis_ld64(constants2[0], CONST_2);
44f54ceb 1616
bb270c08
DB
1617 vis_ld64(constants256_512[0], CONST_256);
1618 vis_faligndata(TMP0, TMP2, REF_S0);
44f54ceb 1619
bb270c08
DB
1620 if (off != 0x7) {
1621 vis_alignaddr_g0((void *)off_plus_1);
1622 vis_faligndata(TMP0, TMP2, REF_S2);
1623 } else {
1624 vis_src1(TMP2, REF_S2);
1625 }
44f54ceb 1626
bb270c08
DB
1627 height >>= 1;
1628 do { /* 26 cycles */
1629 vis_ld64_2(ref, stride, TMP0);
1630 vis_mul8x16au(REF_S0, CONST_256, TMP8);
1631 vis_pmerge(ZERO, REF_S2, TMP12);
44f54ceb 1632
bb270c08 1633 vis_alignaddr_g0((void *)off);
44f54ceb 1634
bb270c08
DB
1635 vis_ld64_2(ref, stride_8, TMP2);
1636 ref += stride;
1637 vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
1638 vis_pmerge(ZERO, REF_S2_1, TMP14);
44f54ceb 1639
bb270c08 1640 vis_ld64_2(ref, stride, TMP4);
44f54ceb 1641
bb270c08
DB
1642 vis_ld64_2(ref, stride_8, TMP6);
1643 ref += stride;
1644 vis_faligndata(TMP0, TMP2, REF_S4);
44f54ceb 1645
bb270c08 1646 vis_pmerge(ZERO, REF_S4, TMP18);
44f54ceb 1647
bb270c08 1648 vis_pmerge(ZERO, REF_S4_1, TMP20);
44f54ceb 1649
bb270c08 1650 vis_faligndata(TMP4, TMP6, REF_S0);
44f54ceb 1651
bb270c08
DB
1652 if (off != 0x7) {
1653 vis_alignaddr_g0((void *)off_plus_1);
1654 vis_faligndata(TMP0, TMP2, REF_S6);
1655 vis_faligndata(TMP4, TMP6, REF_S2);
1656 } else {
1657 vis_src1(TMP2, REF_S6);
1658 vis_src1(TMP6, REF_S2);
1659 }
44f54ceb 1660
bb270c08
DB
1661 vis_padd16(TMP18, CONST_2, TMP18);
1662 vis_mul8x16au(REF_S6, CONST_256, TMP22);
44f54ceb 1663
bb270c08
DB
1664 vis_padd16(TMP20, CONST_2, TMP20);
1665 vis_mul8x16au(REF_S6_1, CONST_256, TMP24);
44f54ceb 1666
bb270c08
DB
1667 vis_mul8x16au(REF_S0, CONST_256, TMP26);
1668 vis_pmerge(ZERO, REF_S0_1, TMP28);
44f54ceb 1669
bb270c08
DB
1670 vis_mul8x16au(REF_S2, CONST_256, TMP30);
1671 vis_padd16(TMP18, TMP22, TMP18);
44f54ceb 1672
bb270c08
DB
1673 vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
1674 vis_padd16(TMP20, TMP24, TMP20);
44f54ceb 1675
bb270c08 1676 vis_padd16(TMP8, TMP18, TMP8);
44f54ceb 1677
bb270c08 1678 vis_padd16(TMP10, TMP20, TMP10);
44f54ceb 1679
bb270c08 1680 vis_padd16(TMP8, TMP12, TMP8);
44f54ceb 1681
bb270c08
DB
1682 vis_padd16(TMP10, TMP14, TMP10);
1683 vis_pack16(TMP8, DST_0);
44f54ceb 1684
bb270c08
DB
1685 vis_pack16(TMP10, DST_1);
1686 vis_st64(DST_0, dest[0]);
1687 dest += stride;
1688 vis_padd16(TMP18, TMP26, TMP18);
44f54ceb 1689
bb270c08 1690 vis_padd16(TMP20, TMP28, TMP20);
44f54ceb 1691
bb270c08 1692 vis_padd16(TMP18, TMP30, TMP18);
44f54ceb 1693
bb270c08
DB
1694 vis_padd16(TMP20, TMP32, TMP20);
1695 vis_pack16(TMP18, DST_2);
44f54ceb 1696
bb270c08
DB
1697 vis_pack16(TMP20, DST_3);
1698 vis_st64(DST_2, dest[0]);
1699 dest += stride;
1700 } while (--height);
44f54ceb
MN
1701}
1702
86decad6 1703static void MC_avg_xy_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 1704 const int stride, int height)
44f54ceb 1705{
bb270c08
DB
1706 unsigned long off = (unsigned long) ref & 0x7;
1707 unsigned long off_plus_1 = off + 1;
1708 int stride_8 = stride + 8;
1709 int stride_16 = stride + 16;
44f54ceb 1710
bb270c08 1711 vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 1712
bb270c08 1713 ref = vis_alignaddr(ref);
44f54ceb 1714
bb270c08
DB
1715 vis_ld64(ref[ 0], TMP0);
1716 vis_fzero(ZERO);
44f54ceb 1717
bb270c08 1718 vis_ld64(ref[ 8], TMP2);
44f54ceb 1719
bb270c08 1720 vis_ld64(ref[16], TMP4);
44f54ceb 1721
bb270c08
DB
1722 vis_ld64(constants6[0], CONST_6);
1723 vis_faligndata(TMP0, TMP2, REF_S0);
44f54ceb 1724
bb270c08
DB
1725 vis_ld64(constants256_1024[0], CONST_256);
1726 vis_faligndata(TMP2, TMP4, REF_S4);
44f54ceb 1727
bb270c08
DB
1728 if (off != 0x7) {
1729 vis_alignaddr_g0((void *)off_plus_1);
1730 vis_faligndata(TMP0, TMP2, REF_S2);
1731 vis_faligndata(TMP2, TMP4, REF_S6);
1732 } else {
1733 vis_src1(TMP2, REF_S2);
1734 vis_src1(TMP4, REF_S6);
1735 }
44f54ceb 1736
bb270c08
DB
1737 height >>= 1;
1738 do { /* 55 cycles */
1739 vis_ld64_2(ref, stride, TMP0);
1740 vis_mul8x16au(REF_S0, CONST_256, TMP12);
1741 vis_pmerge(ZERO, REF_S0_1, TMP14);
44f54ceb 1742
bb270c08 1743 vis_alignaddr_g0((void *)off);
44f54ceb 1744
bb270c08
DB
1745 vis_ld64_2(ref, stride_8, TMP2);
1746 vis_mul8x16au(REF_S2, CONST_256, TMP16);
1747 vis_pmerge(ZERO, REF_S2_1, TMP18);
44f54ceb 1748
bb270c08
DB
1749 vis_ld64_2(ref, stride_16, TMP4);
1750 ref += stride;
1751 vis_mul8x16au(REF_S4, CONST_256, TMP20);
1752 vis_pmerge(ZERO, REF_S4_1, TMP22);
44f54ceb 1753
bb270c08
DB
1754 vis_ld64_2(ref, stride, TMP6);
1755 vis_mul8x16au(REF_S6, CONST_256, TMP24);
1756 vis_pmerge(ZERO, REF_S6_1, TMP26);
44f54ceb 1757
bb270c08
DB
1758 vis_ld64_2(ref, stride_8, TMP8);
1759 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1760
bb270c08
DB
1761 vis_ld64_2(ref, stride_16, TMP10);
1762 ref += stride;
1763 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 1764
bb270c08
DB
1765 vis_ld64(dest[0], DST_0);
1766 vis_faligndata(TMP6, TMP8, REF_S0);
44f54ceb 1767
bb270c08
DB
1768 vis_ld64_2(dest, 8, DST_2);
1769 vis_faligndata(TMP8, TMP10, REF_S4);
44f54ceb 1770
bb270c08
DB
1771 if (off != 0x7) {
1772 vis_alignaddr_g0((void *)off_plus_1);
1773 vis_faligndata(TMP0, TMP2, REF_2);
1774 vis_faligndata(TMP2, TMP4, REF_6);
1775 vis_faligndata(TMP6, TMP8, REF_S2);
1776 vis_faligndata(TMP8, TMP10, REF_S6);
1777 } else {
1778 vis_src1(TMP2, REF_2);
1779 vis_src1(TMP4, REF_6);
1780 vis_src1(TMP8, REF_S2);
1781 vis_src1(TMP10, REF_S6);
1782 }
44f54ceb 1783
bb270c08
DB
1784 vis_mul8x16al(DST_0, CONST_1024, TMP30);
1785 vis_pmerge(ZERO, REF_0, TMP0);
44f54ceb 1786
bb270c08
DB
1787 vis_mul8x16al(DST_1, CONST_1024, TMP32);
1788 vis_pmerge(ZERO, REF_0_1, TMP2);
44f54ceb 1789
bb270c08
DB
1790 vis_mul8x16au(REF_2, CONST_256, TMP4);
1791 vis_pmerge(ZERO, REF_2_1, TMP6);
44f54ceb 1792
bb270c08
DB
1793 vis_mul8x16al(DST_2, CONST_1024, REF_0);
1794 vis_padd16(TMP0, CONST_6, TMP0);
44f54ceb 1795
bb270c08
DB
1796 vis_mul8x16al(DST_3, CONST_1024, REF_2);
1797 vis_padd16(TMP2, CONST_6, TMP2);
44f54ceb 1798
bb270c08
DB
1799 vis_padd16(TMP0, TMP4, TMP0);
1800 vis_mul8x16au(REF_4, CONST_256, TMP4);
44f54ceb 1801
bb270c08
DB
1802 vis_padd16(TMP2, TMP6, TMP2);
1803 vis_mul8x16au(REF_4_1, CONST_256, TMP6);
44f54ceb 1804
bb270c08
DB
1805 vis_padd16(TMP12, TMP0, TMP12);
1806 vis_mul8x16au(REF_6, CONST_256, TMP8);
44f54ceb 1807
bb270c08
DB
1808 vis_padd16(TMP14, TMP2, TMP14);
1809 vis_mul8x16au(REF_6_1, CONST_256, TMP10);
44f54ceb 1810
bb270c08
DB
1811 vis_padd16(TMP12, TMP16, TMP12);
1812 vis_mul8x16au(REF_S0, CONST_256, REF_4);
44f54ceb 1813
bb270c08
DB
1814 vis_padd16(TMP14, TMP18, TMP14);
1815 vis_mul8x16au(REF_S0_1, CONST_256, REF_6);
44f54ceb 1816
bb270c08 1817 vis_padd16(TMP12, TMP30, TMP12);
44f54ceb 1818
bb270c08
DB
1819 vis_padd16(TMP14, TMP32, TMP14);
1820 vis_pack16(TMP12, DST_0);
44f54ceb 1821
bb270c08
DB
1822 vis_pack16(TMP14, DST_1);
1823 vis_st64(DST_0, dest[0]);
1824 vis_padd16(TMP4, CONST_6, TMP4);
44f54ceb 1825
bb270c08
DB
1826 vis_ld64_2(dest, stride, DST_0);
1827 vis_padd16(TMP6, CONST_6, TMP6);
1828 vis_mul8x16au(REF_S2, CONST_256, TMP12);
44f54ceb 1829
bb270c08
DB
1830 vis_padd16(TMP4, TMP8, TMP4);
1831 vis_mul8x16au(REF_S2_1, CONST_256, TMP14);
44f54ceb 1832
bb270c08 1833 vis_padd16(TMP6, TMP10, TMP6);
44f54ceb 1834
bb270c08 1835 vis_padd16(TMP20, TMP4, TMP20);
44f54ceb 1836
bb270c08 1837 vis_padd16(TMP22, TMP6, TMP22);
44f54ceb 1838
bb270c08 1839 vis_padd16(TMP20, TMP24, TMP20);
44f54ceb 1840
bb270c08 1841 vis_padd16(TMP22, TMP26, TMP22);
44f54ceb 1842
bb270c08
DB
1843 vis_padd16(TMP20, REF_0, TMP20);
1844 vis_mul8x16au(REF_S4, CONST_256, REF_0);
44f54ceb 1845
bb270c08
DB
1846 vis_padd16(TMP22, REF_2, TMP22);
1847 vis_pack16(TMP20, DST_2);
44f54ceb 1848
bb270c08
DB
1849 vis_pack16(TMP22, DST_3);
1850 vis_st64_2(DST_2, dest, 8);
1851 dest += stride;
44f54ceb 1852
bb270c08
DB
1853 vis_ld64_2(dest, 8, DST_2);
1854 vis_mul8x16al(DST_0, CONST_1024, TMP30);
1855 vis_pmerge(ZERO, REF_S4_1, REF_2);
44f54ceb 1856
bb270c08
DB
1857 vis_mul8x16al(DST_1, CONST_1024, TMP32);
1858 vis_padd16(REF_4, TMP0, TMP8);
44f54ceb 1859
bb270c08
DB
1860 vis_mul8x16au(REF_S6, CONST_256, REF_4);
1861 vis_padd16(REF_6, TMP2, TMP10);
44f54ceb 1862
bb270c08
DB
1863 vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
1864 vis_padd16(TMP8, TMP12, TMP8);
44f54ceb 1865
bb270c08 1866 vis_padd16(TMP10, TMP14, TMP10);
44f54ceb 1867
bb270c08 1868 vis_padd16(TMP8, TMP30, TMP8);
44f54ceb 1869
bb270c08
DB
1870 vis_padd16(TMP10, TMP32, TMP10);
1871 vis_pack16(TMP8, DST_0);
44f54ceb 1872
bb270c08
DB
1873 vis_pack16(TMP10, DST_1);
1874 vis_st64(DST_0, dest[0]);
44f54ceb 1875
bb270c08 1876 vis_padd16(REF_0, TMP4, REF_0);
44f54ceb 1877
bb270c08
DB
1878 vis_mul8x16al(DST_2, CONST_1024, TMP30);
1879 vis_padd16(REF_2, TMP6, REF_2);
44f54ceb 1880
bb270c08
DB
1881 vis_mul8x16al(DST_3, CONST_1024, TMP32);
1882 vis_padd16(REF_0, REF_4, REF_0);
44f54ceb 1883
bb270c08 1884 vis_padd16(REF_2, REF_6, REF_2);
44f54ceb 1885
bb270c08 1886 vis_padd16(REF_0, TMP30, REF_0);
44f54ceb 1887
bb270c08 1888 /* stall */
44f54ceb 1889
bb270c08
DB
1890 vis_padd16(REF_2, TMP32, REF_2);
1891 vis_pack16(REF_0, DST_2);
44f54ceb 1892
bb270c08
DB
1893 vis_pack16(REF_2, DST_3);
1894 vis_st64_2(DST_2, dest, 8);
1895 dest += stride;
1896 } while (--height);
44f54ceb
MN
1897}
1898
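/* Average into dest, 8 pixels wide, with a half-pel offset in both x and y
 * (rounding variant); two output rows are produced per loop iteration. */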
86decad6 1899static void MC_avg_xy_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 1900 const int stride, int height)
44f54ceb 1901{
bb270c08
DB
1902 unsigned long off = (unsigned long) ref & 0x7;
1903 unsigned long off_plus_1 = off + 1;
1904 int stride_8 = stride + 8;
44f54ceb 1905
bb270c08 1906 vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 1907
bb270c08 1908 ref = vis_alignaddr(ref);
44f54ceb 1909
bb270c08
DB
1910 vis_ld64(ref[0], TMP0);
1911 vis_fzero(ZERO);
44f54ceb 1912
bb270c08 1913 vis_ld64_2(ref, 8, TMP2);
44f54ceb 1914
bb270c08 1915 vis_ld64(constants6[0], CONST_6);
44f54ceb 1916
bb270c08
DB
1917 vis_ld64(constants256_1024[0], CONST_256);
1918 vis_faligndata(TMP0, TMP2, REF_S0);
44f54ceb 1919
bb270c08
DB
1920 if (off != 0x7) {
1921 vis_alignaddr_g0((void *)off_plus_1);
1922 vis_faligndata(TMP0, TMP2, REF_S2);
1923 } else {
1924 vis_src1(TMP2, REF_S2);
1925 }
44f54ceb 1926
bb270c08
DB
1927 height >>= 1;
1928 do { /* 31 cycles */
1929 vis_ld64_2(ref, stride, TMP0);
1930 vis_mul8x16au(REF_S0, CONST_256, TMP8);
1931 vis_pmerge(ZERO, REF_S0_1, TMP10);
44f54ceb 1932
bb270c08
DB
1933 vis_ld64_2(ref, stride_8, TMP2);
1934 ref += stride;
1935 vis_mul8x16au(REF_S2, CONST_256, TMP12);
1936 vis_pmerge(ZERO, REF_S2_1, TMP14);
44f54ceb 1937
bb270c08 1938 vis_alignaddr_g0((void *)off);
44f54ceb 1939
bb270c08
DB
1940 vis_ld64_2(ref, stride, TMP4);
1941 vis_faligndata(TMP0, TMP2, REF_S4);
44f54ceb 1942
bb270c08
DB
1943 vis_ld64_2(ref, stride_8, TMP6);
1944 ref += stride;
44f54ceb 1945
bb270c08
DB
1946 vis_ld64(dest[0], DST_0);
1947 vis_faligndata(TMP4, TMP6, REF_S0);
44f54ceb 1948
bb270c08 1949 vis_ld64_2(dest, stride, DST_2);
44f54ceb 1950
bb270c08
DB
1951 if (off != 0x7) {
1952 vis_alignaddr_g0((void *)off_plus_1);
1953 vis_faligndata(TMP0, TMP2, REF_S6);
1954 vis_faligndata(TMP4, TMP6, REF_S2);
1955 } else {
1956 vis_src1(TMP2, REF_S6);
1957 vis_src1(TMP6, REF_S2);
1958 }
44f54ceb 1959
bb270c08
DB
1960 vis_mul8x16al(DST_0, CONST_1024, TMP30);
1961 vis_pmerge(ZERO, REF_S4, TMP22);
44f54ceb 1962
bb270c08
DB
1963 vis_mul8x16al(DST_1, CONST_1024, TMP32);
1964 vis_pmerge(ZERO, REF_S4_1, TMP24);
44f54ceb 1965
bb270c08
DB
1966 vis_mul8x16au(REF_S6, CONST_256, TMP26);
1967 vis_pmerge(ZERO, REF_S6_1, TMP28);
44f54ceb 1968
bb270c08
DB
1969 vis_mul8x16au(REF_S0, CONST_256, REF_S4);
1970 vis_padd16(TMP22, CONST_6, TMP22);
44f54ceb 1971
bb270c08
DB
1972 vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
1973 vis_padd16(TMP24, CONST_6, TMP24);
44f54ceb 1974
bb270c08
DB
1975 vis_mul8x16al(DST_2, CONST_1024, REF_0);
1976 vis_padd16(TMP22, TMP26, TMP22);
44f54ceb 1977
bb270c08
DB
1978 vis_mul8x16al(DST_3, CONST_1024, REF_2);
1979 vis_padd16(TMP24, TMP28, TMP24);
44f54ceb 1980
bb270c08
DB
1981 vis_mul8x16au(REF_S2, CONST_256, TMP26);
1982 vis_padd16(TMP8, TMP22, TMP8);
44f54ceb 1983
bb270c08
DB
1984 vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
1985 vis_padd16(TMP10, TMP24, TMP10);
44f54ceb 1986
bb270c08 1987 vis_padd16(TMP8, TMP12, TMP8);
44f54ceb 1988
bb270c08 1989 vis_padd16(TMP10, TMP14, TMP10);
44f54ceb 1990
bb270c08 1991 vis_padd16(TMP8, TMP30, TMP8);
44f54ceb 1992
bb270c08
DB
1993 vis_padd16(TMP10, TMP32, TMP10);
1994 vis_pack16(TMP8, DST_0);
44f54ceb 1995
bb270c08
DB
1996 vis_pack16(TMP10, DST_1);
1997 vis_st64(DST_0, dest[0]);
1998 dest += stride;
44f54ceb 1999
bb270c08 2000 vis_padd16(REF_S4, TMP22, TMP12);
44f54ceb 2001
bb270c08 2002 vis_padd16(REF_S6, TMP24, TMP14);
44f54ceb 2003
bb270c08 2004 vis_padd16(TMP12, TMP26, TMP12);
44f54ceb 2005
bb270c08 2006 vis_padd16(TMP14, TMP28, TMP14);
44f54ceb 2007
bb270c08 2008 vis_padd16(TMP12, REF_0, TMP12);
44f54ceb 2009
bb270c08
DB
2010 vis_padd16(TMP14, REF_2, TMP14);
2011 vis_pack16(TMP12, DST_2);
44f54ceb 2012
bb270c08
DB
2013 vis_pack16(TMP14, DST_3);
2014 vis_st64(DST_2, dest[0]);
2015 dest += stride;
2016 } while (--height);
44f54ceb
MN
2017}
2018
2019/* End of rounding code */
2020
2021/* Start of no rounding code */
2022/* The trick used in some of this file is the formula from the MMX
2023 * motion comp code, which is:
2024 *
2025 * (x+y)>>1 == (x&y)+((x^y)>>1)
2026 *
2027 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
2028 * We avoid overflows by masking before we do the shift, and we
2029 * implement the shift by multiplying by 1/2 using mul8x16. So in
2030 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
2031 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
2032 * the value 0x80808080 is in f8):
2033 *
bb270c08
DB
2034 * fxor f0, f2, f10
2035 * fand f10, f4, f10
2036 * fmul8x16 f8, f10, f10
2037 * fand f10, f6, f10
2038 * fand f0, f2, f12
2039 * fpadd16 f12, f10, f10
44f54ceb
MN
2040 */
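
/* Illustrative reference only, not part of the original file: the same
 * truncating byte-wise average written in plain C on a 64-bit word.  The
 * helper name avg8_no_round() is ours, and the block is kept out of the
 * build with #if 0; the MC_avg_no_round_* routines below perform the
 * equivalent steps with vis_xor/vis_and/vis_mul8x16/vis_padd16 on
 * MASK_fe, MASK_7f and CONST_128.
 */
#if 0
static uint64_t avg8_no_round(uint64_t x, uint64_t y)
{
    /* per byte: (x + y) >> 1 == (x & y) + ((x ^ y) >> 1);
     * mask with 0xfe first so no bit crosses a byte boundary in the shift */
    uint64_t half_diff = ((x ^ y) & 0xfefefefefefefefeULL) >> 1;

    /* each per-byte result is at most 255, so this add cannot carry
     * into the neighbouring byte */
    return (x & y) + half_diff;
}
#endif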
2041
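/* Put, no interpolation: plain 16-pixel-wide copy from ref to dest, with
 * source alignment handled by vis_alignaddr/vis_faligndata. */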
86decad6 2042static void MC_put_no_round_o_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 2043 const int stride, int height)
44f54ceb 2044{
bb270c08
DB
2045 ref = vis_alignaddr(ref);
2046 do { /* 5 cycles */
2047 vis_ld64(ref[0], TMP0);
44f54ceb 2048
bb270c08 2049 vis_ld64_2(ref, 8, TMP2);
44f54ceb 2050
bb270c08
DB
2051 vis_ld64_2(ref, 16, TMP4);
2052 ref += stride;
44f54ceb 2053
bb270c08
DB
2054 vis_faligndata(TMP0, TMP2, REF_0);
2055 vis_st64(REF_0, dest[0]);
44f54ceb 2056
bb270c08
DB
2057 vis_faligndata(TMP2, TMP4, REF_2);
2058 vis_st64_2(REF_2, dest, 8);
2059 dest += stride;
2060 } while (--height);
44f54ceb
MN
2061}
2062
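/* Put, no interpolation: plain 8-pixel-wide copy from ref to dest. */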
86decad6 2063static void MC_put_no_round_o_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 2064 const int stride, int height)
44f54ceb 2065{
bb270c08
DB
2066 ref = vis_alignaddr(ref);
2067 do { /* 4 cycles */
2068 vis_ld64(ref[0], TMP0);
44f54ceb 2069
bb270c08
DB
2070 vis_ld64(ref[8], TMP2);
2071 ref += stride;
44f54ceb 2072
bb270c08 2073 /* stall */
44f54ceb 2074
bb270c08
DB
2075 vis_faligndata(TMP0, TMP2, REF_0);
2076 vis_st64(REF_0, dest[0]);
2077 dest += stride;
2078 } while (--height);
44f54ceb
MN
2079}
2080
2081
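/* Average into dest, 16 pixels wide, no interpolation; uses the truncating
 * average trick described above (MASK_fe, MASK_7f, CONST_128). */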
86decad6 2082static void MC_avg_no_round_o_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 2083 const int stride, int height)
44f54ceb 2084{
bb270c08 2085 int stride_8 = stride + 8;
44f54ceb 2086
bb270c08 2087 ref = vis_alignaddr(ref);
44f54ceb 2088
bb270c08 2089 vis_ld64(ref[0], TMP0);
44f54ceb 2090
bb270c08 2091 vis_ld64(ref[8], TMP2);
44f54ceb 2092
bb270c08 2093 vis_ld64(ref[16], TMP4);
44f54ceb 2094
bb270c08 2095 vis_ld64(dest[0], DST_0);
44f54ceb 2096
bb270c08 2097 vis_ld64(dest[8], DST_2);
44f54ceb 2098
bb270c08
DB
2099 vis_ld64(constants_fe[0], MASK_fe);
2100 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2101
bb270c08
DB
2102 vis_ld64(constants_7f[0], MASK_7f);
2103 vis_faligndata(TMP2, TMP4, REF_2);
44f54ceb 2104
bb270c08 2105 vis_ld64(constants128[0], CONST_128);
44f54ceb 2106
bb270c08
DB
2107 ref += stride;
2108 height = (height >> 1) - 1;
44f54ceb 2109
bb270c08
DB
2110 do { /* 24 cycles */
2111 vis_ld64(ref[0], TMP0);
2112 vis_xor(DST_0, REF_0, TMP6);
44f54ceb 2113
bb270c08
DB
2114 vis_ld64_2(ref, 8, TMP2);
2115 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 2116
bb270c08
DB
2117 vis_ld64_2(ref, 16, TMP4);
2118 ref += stride;
2119 vis_mul8x16(CONST_128, TMP6, TMP6);
2120 vis_xor(DST_2, REF_2, TMP8);
44f54ceb 2121
bb270c08 2122 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 2123
bb270c08
DB
2124 vis_and(DST_0, REF_0, TMP10);
2125 vis_ld64_2(dest, stride, DST_0);
2126 vis_mul8x16(CONST_128, TMP8, TMP8);
44f54ceb 2127
bb270c08
DB
2128 vis_and(DST_2, REF_2, TMP12);
2129 vis_ld64_2(dest, stride_8, DST_2);
44f54ceb 2130
bb270c08
DB
2131 vis_ld64(ref[0], TMP14);
2132 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 2133
bb270c08 2134 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 2135
bb270c08
DB
2136 vis_padd16(TMP10, TMP6, TMP6);
2137 vis_st64(TMP6, dest[0]);
44f54ceb 2138
bb270c08
DB
2139 vis_padd16(TMP12, TMP8, TMP8);
2140 vis_st64_2(TMP8, dest, 8);
44f54ceb 2141
bb270c08
DB
2142 dest += stride;
2143 vis_ld64_2(ref, 8, TMP16);
2144 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2145
bb270c08
DB
2146 vis_ld64_2(ref, 16, TMP18);
2147 vis_faligndata(TMP2, TMP4, REF_2);
2148 ref += stride;
44f54ceb 2149
bb270c08 2150 vis_xor(DST_0, REF_0, TMP20);
44f54ceb 2151
bb270c08 2152 vis_and(TMP20, MASK_fe, TMP20);
44f54ceb 2153
bb270c08
DB
2154 vis_xor(DST_2, REF_2, TMP22);
2155 vis_mul8x16(CONST_128, TMP20, TMP20);
44f54ceb 2156
bb270c08 2157 vis_and(TMP22, MASK_fe, TMP22);
44f54ceb 2158
bb270c08
DB
2159 vis_and(DST_0, REF_0, TMP24);
2160 vis_mul8x16(CONST_128, TMP22, TMP22);
44f54ceb 2161
bb270c08 2162 vis_and(DST_2, REF_2, TMP26);
44f54ceb 2163
bb270c08
DB
2164 vis_ld64_2(dest, stride, DST_0);
2165 vis_faligndata(TMP14, TMP16, REF_0);
44f54ceb 2166
bb270c08
DB
2167 vis_ld64_2(dest, stride_8, DST_2);
2168 vis_faligndata(TMP16, TMP18, REF_2);
44f54ceb 2169
bb270c08 2170 vis_and(TMP20, MASK_7f, TMP20);
44f54ceb 2171
bb270c08 2172 vis_and(TMP22, MASK_7f, TMP22);
44f54ceb 2173
bb270c08
DB
2174 vis_padd16(TMP24, TMP20, TMP20);
2175 vis_st64(TMP20, dest[0]);
44f54ceb 2176
bb270c08
DB
2177 vis_padd16(TMP26, TMP22, TMP22);
2178 vis_st64_2(TMP22, dest, 8);
2179 dest += stride;
2180 } while (--height);
44f54ceb 2181
bb270c08
DB
2182 vis_ld64(ref[0], TMP0);
2183 vis_xor(DST_0, REF_0, TMP6);
44f54ceb 2184
bb270c08
DB
2185 vis_ld64_2(ref, 8, TMP2);
2186 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 2187
bb270c08
DB
2188 vis_ld64_2(ref, 16, TMP4);
2189 vis_mul8x16(CONST_128, TMP6, TMP6);
2190 vis_xor(DST_2, REF_2, TMP8);
44f54ceb 2191
bb270c08 2192 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 2193
bb270c08
DB
2194 vis_and(DST_0, REF_0, TMP10);
2195 vis_ld64_2(dest, stride, DST_0);
2196 vis_mul8x16(CONST_128, TMP8, TMP8);
44f54ceb 2197
bb270c08
DB
2198 vis_and(DST_2, REF_2, TMP12);
2199 vis_ld64_2(dest, stride_8, DST_2);
44f54ceb 2200
bb270c08
DB
2201 vis_ld64(ref[0], TMP14);
2202 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 2203
bb270c08 2204 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 2205
bb270c08
DB
2206 vis_padd16(TMP10, TMP6, TMP6);
2207 vis_st64(TMP6, dest[0]);
44f54ceb 2208
bb270c08
DB
2209 vis_padd16(TMP12, TMP8, TMP8);
2210 vis_st64_2(TMP8, dest, 8);
44f54ceb 2211
bb270c08
DB
2212 dest += stride;
2213 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2214
bb270c08 2215 vis_faligndata(TMP2, TMP4, REF_2);
44f54ceb 2216
bb270c08 2217 vis_xor(DST_0, REF_0, TMP20);
44f54ceb 2218
bb270c08 2219 vis_and(TMP20, MASK_fe, TMP20);
44f54ceb 2220
bb270c08
DB
2221 vis_xor(DST_2, REF_2, TMP22);
2222 vis_mul8x16(CONST_128, TMP20, TMP20);
44f54ceb 2223
bb270c08 2224 vis_and(TMP22, MASK_fe, TMP22);
44f54ceb 2225
bb270c08
DB
2226 vis_and(DST_0, REF_0, TMP24);
2227 vis_mul8x16(CONST_128, TMP22, TMP22);
44f54ceb 2228
bb270c08 2229 vis_and(DST_2, REF_2, TMP26);
44f54ceb 2230
bb270c08 2231 vis_and(TMP20, MASK_7f, TMP20);
44f54ceb 2232
bb270c08 2233 vis_and(TMP22, MASK_7f, TMP22);
44f54ceb 2234
bb270c08
DB
2235 vis_padd16(TMP24, TMP20, TMP20);
2236 vis_st64(TMP20, dest[0]);
44f54ceb 2237
bb270c08
DB
2238 vis_padd16(TMP26, TMP22, TMP22);
2239 vis_st64_2(TMP22, dest, 8);
44f54ceb
MN
2240}
2241
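/* Average into dest, 8 pixels wide, no interpolation (truncating average). */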
86decad6 2242static void MC_avg_no_round_o_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 2243 const int stride, int height)
44f54ceb 2244{
bb270c08 2245 ref = vis_alignaddr(ref);
44f54ceb 2246
bb270c08 2247 vis_ld64(ref[0], TMP0);
44f54ceb 2248
bb270c08 2249 vis_ld64(ref[8], TMP2);
44f54ceb 2250
bb270c08 2251 vis_ld64(dest[0], DST_0);
44f54ceb 2252
bb270c08 2253 vis_ld64(constants_fe[0], MASK_fe);
44f54ceb 2254
bb270c08
DB
2255 vis_ld64(constants_7f[0], MASK_7f);
2256 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2257
bb270c08 2258 vis_ld64(constants128[0], CONST_128);
44f54ceb 2259
bb270c08
DB
2260 ref += stride;
2261 height = (height >> 1) - 1;
44f54ceb 2262
bb270c08
DB
2263 do { /* 12 cycles */
2264 vis_ld64(ref[0], TMP0);
2265 vis_xor(DST_0, REF_0, TMP4);
44f54ceb 2266
bb270c08
DB
2267 vis_ld64(ref[8], TMP2);
2268 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 2269
bb270c08
DB
2270 vis_and(DST_0, REF_0, TMP6);
2271 vis_ld64_2(dest, stride, DST_0);
2272 ref += stride;
2273 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 2274
bb270c08
DB
2275 vis_ld64(ref[0], TMP12);
2276 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2277
bb270c08
DB
2278 vis_ld64(ref[8], TMP2);
2279 vis_xor(DST_0, REF_0, TMP0);
2280 ref += stride;
44f54ceb 2281
bb270c08 2282 vis_and(TMP0, MASK_fe, TMP0);
44f54ceb 2283
bb270c08 2284 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 2285
bb270c08
DB
2286 vis_padd16(TMP6, TMP4, TMP4);
2287 vis_st64(TMP4, dest[0]);
2288 dest += stride;
2289 vis_mul8x16(CONST_128, TMP0, TMP0);
44f54ceb 2290
bb270c08
DB
2291 vis_and(DST_0, REF_0, TMP6);
2292 vis_ld64_2(dest, stride, DST_0);
44f54ceb 2293
bb270c08 2294 vis_faligndata(TMP12, TMP2, REF_0);
44f54ceb 2295
bb270c08 2296 vis_and(TMP0, MASK_7f, TMP0);
44f54ceb 2297
bb270c08
DB
2298 vis_padd16(TMP6, TMP0, TMP4);
2299 vis_st64(TMP4, dest[0]);
2300 dest += stride;
2301 } while (--height);
44f54ceb 2302
bb270c08
DB
2303 vis_ld64(ref[0], TMP0);
2304 vis_xor(DST_0, REF_0, TMP4);
44f54ceb 2305
bb270c08
DB
2306 vis_ld64(ref[8], TMP2);
2307 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 2308
bb270c08
DB
2309 vis_and(DST_0, REF_0, TMP6);
2310 vis_ld64_2(dest, stride, DST_0);
2311 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 2312
bb270c08 2313 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2314
bb270c08 2315 vis_xor(DST_0, REF_0, TMP0);
44f54ceb 2316
bb270c08 2317 vis_and(TMP0, MASK_fe, TMP0);
44f54ceb 2318
bb270c08 2319 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 2320
bb270c08
DB
2321 vis_padd16(TMP6, TMP4, TMP4);
2322 vis_st64(TMP4, dest[0]);
2323 dest += stride;
2324 vis_mul8x16(CONST_128, TMP0, TMP0);
44f54ceb 2325
bb270c08 2326 vis_and(DST_0, REF_0, TMP6);
44f54ceb 2327
bb270c08 2328 vis_and(TMP0, MASK_7f, TMP0);
44f54ceb 2329
bb270c08
DB
2330 vis_padd16(TMP6, TMP0, TMP4);
2331 vis_st64(TMP4, dest[0]);
44f54ceb
MN
2332}
2333
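/* Put, 16 pixels wide, half-pel offset in x: each output byte is the
 * truncating average of a source byte and its right-hand neighbour. */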
86decad6 2334static void MC_put_no_round_x_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 2335 const int stride, int height)
44f54ceb 2336{
bb270c08
DB
2337 unsigned long off = (unsigned long) ref & 0x7;
2338 unsigned long off_plus_1 = off + 1;
44f54ceb 2339
bb270c08 2340 ref = vis_alignaddr(ref);
44f54ceb 2341
bb270c08 2342 vis_ld64(ref[0], TMP0);
44f54ceb 2343
bb270c08 2344 vis_ld64_2(ref, 8, TMP2);
44f54ceb 2345
bb270c08 2346 vis_ld64_2(ref, 16, TMP4);
44f54ceb 2347
bb270c08 2348 vis_ld64(constants_fe[0], MASK_fe);
44f54ceb 2349
bb270c08
DB
2350 vis_ld64(constants_7f[0], MASK_7f);
2351 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2352
bb270c08
DB
2353 vis_ld64(constants128[0], CONST_128);
2354 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 2355
bb270c08
DB
2356 if (off != 0x7) {
2357 vis_alignaddr_g0((void *)off_plus_1);
2358 vis_faligndata(TMP0, TMP2, REF_2);
2359 vis_faligndata(TMP2, TMP4, REF_6);
2360 } else {
2361 vis_src1(TMP2, REF_2);
2362 vis_src1(TMP4, REF_6);
2363 }
44f54ceb 2364
bb270c08
DB
2365 ref += stride;
2366 height = (height >> 1) - 1;
44f54ceb 2367
bb270c08
DB
2368 do { /* 34 cycles */
2369 vis_ld64(ref[0], TMP0);
2370 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 2371
bb270c08
DB
2372 vis_ld64_2(ref, 8, TMP2);
2373 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 2374
bb270c08
DB
2375 vis_ld64_2(ref, 16, TMP4);
2376 vis_and(TMP6, MASK_fe, TMP6);
2377 ref += stride;
44f54ceb 2378
bb270c08
DB
2379 vis_ld64(ref[0], TMP14);
2380 vis_mul8x16(CONST_128, TMP6, TMP6);
2381 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 2382
bb270c08
DB
2383 vis_ld64_2(ref, 8, TMP16);
2384 vis_mul8x16(CONST_128, TMP8, TMP8);
2385 vis_and(REF_0, REF_2, TMP10);
44f54ceb 2386
bb270c08
DB
2387 vis_ld64_2(ref, 16, TMP18);
2388 ref += stride;
2389 vis_and(REF_4, REF_6, TMP12);
44f54ceb 2390
bb270c08 2391 vis_alignaddr_g0((void *)off);
44f54ceb 2392
bb270c08 2393 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2394
bb270c08 2395 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 2396
bb270c08
DB
2397 if (off != 0x7) {
2398 vis_alignaddr_g0((void *)off_plus_1);
2399 vis_faligndata(TMP0, TMP2, REF_2);
2400 vis_faligndata(TMP2, TMP4, REF_6);
2401 } else {
2402 vis_src1(TMP2, REF_2);
2403 vis_src1(TMP4, REF_6);
2404 }
44f54ceb 2405
bb270c08 2406 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 2407
bb270c08 2408 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 2409
bb270c08
DB
2410 vis_padd16(TMP10, TMP6, TMP6);
2411 vis_st64(TMP6, dest[0]);
44f54ceb 2412
bb270c08
DB
2413 vis_padd16(TMP12, TMP8, TMP8);
2414 vis_st64_2(TMP8, dest, 8);
2415 dest += stride;
44f54ceb 2416
bb270c08 2417 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 2418
bb270c08 2419 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 2420
bb270c08 2421 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 2422
bb270c08
DB
2423 vis_mul8x16(CONST_128, TMP6, TMP6);
2424 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 2425
bb270c08
DB
2426 vis_mul8x16(CONST_128, TMP8, TMP8);
2427 vis_and(REF_0, REF_2, TMP10);
44f54ceb 2428
bb270c08 2429 vis_and(REF_4, REF_6, TMP12);
44f54ceb 2430
bb270c08 2431 vis_alignaddr_g0((void *)off);
44f54ceb 2432
bb270c08 2433 vis_faligndata(TMP14, TMP16, REF_0);
44f54ceb 2434
bb270c08 2435 vis_faligndata(TMP16, TMP18, REF_4);
44f54ceb 2436
bb270c08
DB
2437 if (off != 0x7) {
2438 vis_alignaddr_g0((void *)off_plus_1);
2439 vis_faligndata(TMP14, TMP16, REF_2);
2440 vis_faligndata(TMP16, TMP18, REF_6);
2441 } else {
2442 vis_src1(TMP16, REF_2);
2443 vis_src1(TMP18, REF_6);
2444 }
44f54ceb 2445
bb270c08 2446 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 2447
bb270c08 2448 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 2449
bb270c08
DB
2450 vis_padd16(TMP10, TMP6, TMP6);
2451 vis_st64(TMP6, dest[0]);
44f54ceb 2452
bb270c08
DB
2453 vis_padd16(TMP12, TMP8, TMP8);
2454 vis_st64_2(TMP8, dest, 8);
2455 dest += stride;
2456 } while (--height);
44f54ceb 2457
bb270c08
DB
2458 vis_ld64(ref[0], TMP0);
2459 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 2460
bb270c08
DB
2461 vis_ld64_2(ref, 8, TMP2);
2462 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 2463
bb270c08
DB
2464 vis_ld64_2(ref, 16, TMP4);
2465 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 2466
bb270c08
DB
2467 vis_mul8x16(CONST_128, TMP6, TMP6);
2468 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 2469
bb270c08
DB
2470 vis_mul8x16(CONST_128, TMP8, TMP8);
2471 vis_and(REF_0, REF_2, TMP10);
44f54ceb 2472
bb270c08 2473 vis_and(REF_4, REF_6, TMP12);
44f54ceb 2474
bb270c08 2475 vis_alignaddr_g0((void *)off);
44f54ceb 2476
bb270c08 2477 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2478
bb270c08 2479 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 2480
bb270c08
DB
2481 if (off != 0x7) {
2482 vis_alignaddr_g0((void *)off_plus_1);
2483 vis_faligndata(TMP0, TMP2, REF_2);
2484 vis_faligndata(TMP2, TMP4, REF_6);
2485 } else {
2486 vis_src1(TMP2, REF_2);
2487 vis_src1(TMP4, REF_6);
2488 }
44f54ceb 2489
bb270c08 2490 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 2491
bb270c08 2492 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 2493
bb270c08
DB
2494 vis_padd16(TMP10, TMP6, TMP6);
2495 vis_st64(TMP6, dest[0]);
44f54ceb 2496
bb270c08
DB
2497 vis_padd16(TMP12, TMP8, TMP8);
2498 vis_st64_2(TMP8, dest, 8);
2499 dest += stride;
44f54ceb 2500
bb270c08 2501 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 2502
bb270c08 2503 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 2504
bb270c08 2505 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 2506
bb270c08
DB
2507 vis_mul8x16(CONST_128, TMP6, TMP6);
2508 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 2509
bb270c08
DB
2510 vis_mul8x16(CONST_128, TMP8, TMP8);
2511 vis_and(REF_0, REF_2, TMP10);
44f54ceb 2512
bb270c08 2513 vis_and(REF_4, REF_6, TMP12);
44f54ceb 2514
bb270c08 2515 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 2516
bb270c08 2517 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 2518
bb270c08
DB
2519 vis_padd16(TMP10, TMP6, TMP6);
2520 vis_st64(TMP6, dest[0]);
44f54ceb 2521
bb270c08
DB
2522 vis_padd16(TMP12, TMP8, TMP8);
2523 vis_st64_2(TMP8, dest, 8);
44f54ceb
MN
2524}
2525
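/* Put, 8 pixels wide, half-pel offset in x (truncating average). */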
86decad6 2526static void MC_put_no_round_x_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 2527 const int stride, int height)
44f54ceb 2528{
bb270c08
DB
2529 unsigned long off = (unsigned long) ref & 0x7;
2530 unsigned long off_plus_1 = off + 1;
44f54ceb 2531
bb270c08 2532 ref = vis_alignaddr(ref);
44f54ceb 2533
bb270c08 2534 vis_ld64(ref[0], TMP0);
44f54ceb 2535
bb270c08 2536 vis_ld64(ref[8], TMP2);
44f54ceb 2537
bb270c08 2538 vis_ld64(constants_fe[0], MASK_fe);
44f54ceb 2539
bb270c08 2540 vis_ld64(constants_7f[0], MASK_7f);
44f54ceb 2541
bb270c08
DB
2542 vis_ld64(constants128[0], CONST_128);
2543 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2544
bb270c08
DB
2545 if (off != 0x7) {
2546 vis_alignaddr_g0((void *)off_plus_1);
2547 vis_faligndata(TMP0, TMP2, REF_2);
2548 } else {
2549 vis_src1(TMP2, REF_2);
2550 }
44f54ceb 2551
bb270c08
DB
2552 ref += stride;
2553 height = (height >> 1) - 1;
44f54ceb 2554
bb270c08
DB
2555 do { /* 20 cycles */
2556 vis_ld64(ref[0], TMP0);
2557 vis_xor(REF_0, REF_2, TMP4);
44f54ceb 2558
bb270c08
DB
2559 vis_ld64_2(ref, 8, TMP2);
2560 vis_and(TMP4, MASK_fe, TMP4);
2561 ref += stride;
44f54ceb 2562
bb270c08
DB
2563 vis_ld64(ref[0], TMP8);
2564 vis_and(REF_0, REF_2, TMP6);
2565 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 2566
bb270c08 2567 vis_alignaddr_g0((void *)off);
44f54ceb 2568
bb270c08
DB
2569 vis_ld64_2(ref, 8, TMP10);
2570 ref += stride;
2571 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2572
bb270c08
DB
2573 if (off != 0x7) {
2574 vis_alignaddr_g0((void *)off_plus_1);
2575 vis_faligndata(TMP0, TMP2, REF_2);
2576 } else {
2577 vis_src1(TMP2, REF_2);
2578 }
44f54ceb 2579
bb270c08 2580 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 2581
bb270c08
DB
2582 vis_padd16(TMP6, TMP4, DST_0);
2583 vis_st64(DST_0, dest[0]);
2584 dest += stride;
44f54ceb 2585
bb270c08 2586 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 2587
bb270c08 2588 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 2589
bb270c08
DB
2590 vis_and(REF_0, REF_2, TMP14);
2591 vis_mul8x16(CONST_128, TMP12, TMP12);
44f54ceb 2592
bb270c08
DB
2593 vis_alignaddr_g0((void *)off);
2594 vis_faligndata(TMP8, TMP10, REF_0);
2595 if (off != 0x7) {
2596 vis_alignaddr_g0((void *)off_plus_1);
2597 vis_faligndata(TMP8, TMP10, REF_2);
2598 } else {
2599 vis_src1(TMP10, REF_2);
2600 }
44f54ceb 2601
bb270c08 2602 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 2603
bb270c08
DB
2604 vis_padd16(TMP14, TMP12, DST_0);
2605 vis_st64(DST_0, dest[0]);
2606 dest += stride;
2607 } while (--height);
44f54ceb 2608
bb270c08
DB
2609 vis_ld64(ref[0], TMP0);
2610 vis_xor(REF_0, REF_2, TMP4);
44f54ceb 2611
bb270c08
DB
2612 vis_ld64_2(ref, 8, TMP2);
2613 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 2614
bb270c08
DB
2615 vis_and(REF_0, REF_2, TMP6);
2616 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 2617
bb270c08 2618 vis_alignaddr_g0((void *)off);
44f54ceb 2619
bb270c08 2620 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2621
bb270c08
DB
2622 if (off != 0x7) {
2623 vis_alignaddr_g0((void *)off_plus_1);
2624 vis_faligndata(TMP0, TMP2, REF_2);
2625 } else {
2626 vis_src1(TMP2, REF_2);
2627 }
44f54ceb 2628
bb270c08 2629 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 2630
bb270c08
DB
2631 vis_padd16(TMP6, TMP4, DST_0);
2632 vis_st64(DST_0, dest[0]);
2633 dest += stride;
44f54ceb 2634
bb270c08 2635 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 2636
bb270c08 2637 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 2638
bb270c08
DB
2639 vis_and(REF_0, REF_2, TMP14);
2640 vis_mul8x16(CONST_128, TMP12, TMP12);
44f54ceb 2641
bb270c08 2642 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 2643
bb270c08
DB
2644 vis_padd16(TMP14, TMP12, DST_0);
2645 vis_st64(DST_0, dest[0]);
2646 dest += stride;
44f54ceb
MN
2647}
2648
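/* Average into dest, 16 pixels wide, half-pel offset in x
 * (no-rounding variant). */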
86decad6 2649static void MC_avg_no_round_x_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 2650 const int stride, int height)
44f54ceb 2651{
bb270c08
DB
2652 unsigned long off = (unsigned long) ref & 0x7;
2653 unsigned long off_plus_1 = off + 1;
44f54ceb 2654
bb270c08 2655 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 2656
bb270c08
DB
2657 vis_ld64(constants3[0], CONST_3);
2658 vis_fzero(ZERO);
2659 vis_ld64(constants256_512[0], CONST_256);
44f54ceb 2660
bb270c08
DB
2661 ref = vis_alignaddr(ref);
2662 do { /* 26 cycles */
2663 vis_ld64(ref[0], TMP0);
44f54ceb 2664
bb270c08 2665 vis_ld64(ref[8], TMP2);
44f54ceb 2666
bb270c08 2667 vis_alignaddr_g0((void *)off);
44f54ceb 2668
bb270c08 2669 vis_ld64(ref[16], TMP4);
44f54ceb 2670
bb270c08
DB
2671 vis_ld64(dest[0], DST_0);
2672 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2673
bb270c08
DB
2674 vis_ld64(dest[8], DST_2);
2675 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 2676
bb270c08
DB
2677 if (off != 0x7) {
2678 vis_alignaddr_g0((void *)off_plus_1);
2679 vis_faligndata(TMP0, TMP2, REF_2);
2680 vis_faligndata(TMP2, TMP4, REF_6);
2681 } else {
2682 vis_src1(TMP2, REF_2);
2683 vis_src1(TMP4, REF_6);
2684 }
44f54ceb 2685
bb270c08 2686 vis_mul8x16au(REF_0, CONST_256, TMP0);
44f54ceb 2687
bb270c08
DB
2688 vis_pmerge(ZERO, REF_2, TMP4);
2689 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
44f54ceb 2690
bb270c08 2691 vis_pmerge(ZERO, REF_2_1, TMP6);
44f54ceb 2692
bb270c08 2693 vis_padd16(TMP0, TMP4, TMP0);
44f54ceb 2694
bb270c08
DB
2695 vis_mul8x16al(DST_0, CONST_512, TMP4);
2696 vis_padd16(TMP2, TMP6, TMP2);
44f54ceb 2697
bb270c08 2698 vis_mul8x16al(DST_1, CONST_512, TMP6);
44f54ceb 2699
bb270c08 2700 vis_mul8x16au(REF_6, CONST_256, TMP12);
44f54ceb 2701
bb270c08
DB
2702 vis_padd16(TMP0, TMP4, TMP0);
2703 vis_mul8x16au(REF_6_1, CONST_256, TMP14);
44f54ceb 2704
bb270c08
DB
2705 vis_padd16(TMP2, TMP6, TMP2);
2706 vis_mul8x16au(REF_4, CONST_256, TMP16);
44f54ceb 2707
bb270c08
DB
2708 vis_padd16(TMP0, CONST_3, TMP8);
2709 vis_mul8x16au(REF_4_1, CONST_256, TMP18);
44f54ceb 2710
bb270c08
DB
2711 vis_padd16(TMP2, CONST_3, TMP10);
2712 vis_pack16(TMP8, DST_0);
44f54ceb 2713
bb270c08
DB
2714 vis_pack16(TMP10, DST_1);
2715 vis_padd16(TMP16, TMP12, TMP0);
44f54ceb 2716
bb270c08
DB
2717 vis_st64(DST_0, dest[0]);
2718 vis_mul8x16al(DST_2, CONST_512, TMP4);
2719 vis_padd16(TMP18, TMP14, TMP2);
44f54ceb 2720
bb270c08
DB
2721 vis_mul8x16al(DST_3, CONST_512, TMP6);
2722 vis_padd16(TMP0, CONST_3, TMP0);
44f54ceb 2723
bb270c08 2724 vis_padd16(TMP2, CONST_3, TMP2);
44f54ceb 2725
bb270c08 2726 vis_padd16(TMP0, TMP4, TMP0);
44f54ceb 2727
bb270c08
DB
2728 vis_padd16(TMP2, TMP6, TMP2);
2729 vis_pack16(TMP0, DST_2);
44f54ceb 2730
bb270c08
DB
2731 vis_pack16(TMP2, DST_3);
2732 vis_st64(DST_2, dest[8]);
44f54ceb 2733
bb270c08
DB
2734 ref += stride;
2735 dest += stride;
2736 } while (--height);
44f54ceb
MN
2737}
2738
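/* Average into dest, 8 pixels wide, half-pel offset in x
 * (no-rounding variant). */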
86decad6 2739static void MC_avg_no_round_x_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 2740 const int stride, int height)
44f54ceb 2741{
bb270c08
DB
2742 unsigned long off = (unsigned long) ref & 0x7;
2743 unsigned long off_plus_1 = off + 1;
2744 int stride_times_2 = stride << 1;
44f54ceb 2745
bb270c08 2746 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 2747
bb270c08
DB