Replace ffmpeg references with more accurate libav* references.
[libav.git] / libavcodec / sparc / dsputil_vis.c
CommitLineData
44f54ceb 1/*
44f54ceb
MN
2 * Copyright (C) 2003 David S. Miller <davem@redhat.com>
3 *
2912e87a 4 * This file is part of Libav.
44f54ceb 5 *
2912e87a 6 * Libav is free software; you can redistribute it and/or
a33fe572
DB
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
44f54ceb 10 *
2912e87a 11 * Libav is distributed in the hope that it will be useful,
44f54ceb 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
a33fe572
DB
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
44f54ceb 15 *
a33fe572 16 * You should have received a copy of the GNU Lesser General Public
2912e87a 17 * License along with Libav; if not, write to the Free Software
5509bffa 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
44f54ceb
MN
19 */
20
0f12310f 21/* The *no_round* functions have been added by James A. Morrison, 2003,2004.
2f5df0b1 22 The vis code from libmpeg2 was adapted for libavcodec by James A. Morrison.
44f54ceb
MN
23 */
24
25#include "config.h"
26
44f54ceb
MN
27#include <inttypes.h>
28
245976da 29#include "libavcodec/dsputil.h"
ad403802 30#include "dsputil_vis.h"
44f54ceb
MN
31
32#include "vis.h"
33
34/* The trick used in some of this file is the formula from the MMX
35 * motion comp code, which is:
36 *
37 * (x+y+1)>>1 == (x|y)-((x^y)>>1)
38 *
39 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
40 * We avoid overflows by masking before we do the shift, and we
41 * implement the shift by multiplying by 1/2 using mul8x16. So in
42 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
43 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
44 * the value 0x80808080 is in f8):
45 *
bb270c08
DB
46 * fxor f0, f2, f10
47 * fand f10, f4, f10
48 * fmul8x16 f8, f10, f10
49 * fand f10, f6, f10
50 * for f0, f2, f12
51 * fpsub16 f12, f10, f10
44f54ceb
MN
52 */
53
44f54ceb
MN
/* Helpers to build a 64-bit constant by replicating one element. */
#define DUP4(x) {x, x, x, x}
#define DUP8(x) {x, x, x, x, x, x, x, x}
/* Constant operand tables for the VIS kernels below.  Each table is one
 * 64-bit datum loaded with vis_ld64, hence the 8-byte alignment.
 * constants256_512 / constants256_1024 pack two different multipliers in
 * one register: mul8x16au consumes the upper 16-bit halves (256) and
 * mul8x16al the lower halves (512 or 1024). */
DECLARE_ALIGNED(8, static const int16_t, constants1)[] = DUP4 (1);
DECLARE_ALIGNED(8, static const int16_t, constants2)[] = DUP4 (2);
DECLARE_ALIGNED(8, static const int16_t, constants3)[] = DUP4 (3);
DECLARE_ALIGNED(8, static const int16_t, constants6)[] = DUP4 (6);
DECLARE_ALIGNED(8, static const int8_t, constants_fe)[] = DUP8 (0xfe);
DECLARE_ALIGNED(8, static const int8_t, constants_7f)[] = DUP8 (0x7f);
DECLARE_ALIGNED(8, static const int8_t, constants128)[] = DUP8 (128);
DECLARE_ALIGNED(8, static const int16_t, constants256_512)[] =
        {256, 512, 256, 512};
DECLARE_ALIGNED(8, static const int16_t, constants256_1024)[] =
        {256, 1024, 256, 1024};
67
/* Symbolic indices for the VIS floating-point registers used by the vis.h
 * macro wrappers (vis_ld64, vis_faligndata, ...).  REF_* hold aligned
 * reference pixels, DST_* destination pixels, TMP* are scratch.
 * NOTE(review): the *_1 variants (odd numbers) appear to name the second
 * 32-bit half of the preceding 64-bit register pair -- confirm in vis.h.
 * Several names deliberately alias one register number: the CONST_1/2/3/6
 * group and MASK_fe all map to 20, the CONST_128/256/512/1024 group to 22,
 * and ZERO/MASK_7f to 30.  Each function only ever loads compatible
 * members of a group (e.g. CONST_256 and CONST_512 are the two halves of
 * the same constants256_512 load), so the aliases never clash. */
#define REF_0 0
#define REF_0_1 1
#define REF_2 2
#define REF_2_1 3
#define REF_4 4
#define REF_4_1 5
#define REF_6 6
#define REF_6_1 7
#define REF_S0 8
#define REF_S0_1 9
#define REF_S2 10
#define REF_S2_1 11
#define REF_S4 12
#define REF_S4_1 13
#define REF_S6 14
#define REF_S6_1 15
#define DST_0 16
#define DST_1 17
#define DST_2 18
#define DST_3 19
#define CONST_1 20
#define CONST_2 20
#define CONST_3 20
#define CONST_6 20
#define MASK_fe 20
#define CONST_128 22
#define CONST_256 22
#define CONST_512 22
#define CONST_1024 22
#define TMP0 24
#define TMP1 25
#define TMP2 26
#define TMP3 27
#define TMP4 28
#define TMP5 29
#define ZERO 30
#define MASK_7f 30

/* TMP6 and up index the upper register bank (double-precision-only
 * registers on SPARC) -- NOTE(review): confirm numbering against vis.h. */
#define TMP6 32
#define TMP8 34
#define TMP10 36
#define TMP12 38
#define TMP14 40
#define TMP16 42
#define TMP18 44
#define TMP20 46
#define TMP22 48
#define TMP24 50
#define TMP26 52
#define TMP28 54
#define TMP30 56
#define TMP32 58
44f54ceb 120
/* put, no interpolation, 16 bytes wide: copy `height` rows of 16 bytes
 * from (possibly unaligned) `ref` to `dest`.  Alignment of ref is handled
 * with vis_alignaddr + vis_faligndata; dest is assumed 8-byte aligned for
 * vis_st64 -- TODO confirm at call sites. */
static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * ref,
                             const int stride, int height)
{
    ref = vis_alignaddr(ref);
    do {    /* 5 cycles */
        /* Three 8-byte loads cover the 16 output bytes at any alignment. */
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;

        /* Extract the two aligned 8-byte halves and store them. */
        vis_faligndata(TMP0, TMP2, REF_0);
        vis_st64(REF_0, dest[0]);

        vis_faligndata(TMP2, TMP4, REF_2);
        vis_st64_2(REF_2, dest, 8);
        dest += stride;
    } while (--height);
}
141
/* put, no interpolation, 8 bytes wide: copy `height` rows of 8 bytes from
 * (possibly unaligned) `ref` to 8-byte-aligned `dest`. */
static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * ref,
                            const int stride, int height)
{
    ref = vis_alignaddr(ref);
    do {    /* 4 cycles */
        /* Two loads straddle the 8 wanted bytes at any alignment. */
        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);
        ref += stride;

        /* stall */

        vis_faligndata(TMP0, TMP2, REF_0);
        vis_st64(REF_0, dest[0]);
        dest += stride;
    } while (--height);
}
159
160
/* avg, no interpolation, 16 bytes wide: dest = rounded average of dest and
 * ref, using the (x|y) - ((x^y)>>1) trick described at the top of this
 * file (xor/and MASK_fe/mul8x16 by 128/and MASK_7f, then or/psub16).
 * The loop is software-pipelined and handles two rows per iteration, with
 * a prologue before the loop and one final double-row epilogue after it;
 * height is assumed even and >= 4 -- TODO confirm at call sites. */
static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * ref,
                             const int stride, int height)
{
    int stride_8 = stride + 8;

    ref = vis_alignaddr(ref);

    /* Prologue: preload the first row of ref and dest and the constants. */
    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(dest[0], DST_0);

    vis_ld64(dest[8], DST_2);

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP2, TMP4, REF_2);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 24 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP6);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP6, MASK_fe, TMP6);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;
        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_xor(DST_2, REF_2, TMP8);

        vis_and(TMP8, MASK_fe, TMP8);

        vis_or(DST_0, REF_0, TMP10);
        vis_ld64_2(dest, stride, DST_0);
        vis_mul8x16(CONST_128, TMP8, TMP8);

        vis_or(DST_2, REF_2, TMP12);
        vis_ld64_2(dest, stride_8, DST_2);

        vis_ld64(ref[0], TMP14);
        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_psub16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_psub16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);

        dest += stride;
        vis_ld64_2(ref, 8, TMP16);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, 16, TMP18);
        vis_faligndata(TMP2, TMP4, REF_2);
        ref += stride;

        /* Second row of the pair, same average computation. */
        vis_xor(DST_0, REF_0, TMP20);

        vis_and(TMP20, MASK_fe, TMP20);

        vis_xor(DST_2, REF_2, TMP22);
        vis_mul8x16(CONST_128, TMP20, TMP20);

        vis_and(TMP22, MASK_fe, TMP22);

        vis_or(DST_0, REF_0, TMP24);
        vis_mul8x16(CONST_128, TMP22, TMP22);

        vis_or(DST_2, REF_2, TMP26);

        vis_ld64_2(dest, stride, DST_0);
        vis_faligndata(TMP14, TMP16, REF_0);

        vis_ld64_2(dest, stride_8, DST_2);
        vis_faligndata(TMP16, TMP18, REF_2);

        vis_and(TMP20, MASK_7f, TMP20);

        vis_and(TMP22, MASK_7f, TMP22);

        vis_psub16(TMP24, TMP20, TMP20);
        vis_st64(TMP20, dest[0]);

        vis_psub16(TMP26, TMP22, TMP22);
        vis_st64_2(TMP22, dest, 8);
        dest += stride;
    } while (--height);

    /* Epilogue: the final two rows, without loading past the block. */
    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_ld64_2(ref, 16, TMP4);
    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_xor(DST_2, REF_2, TMP8);

    vis_and(TMP8, MASK_fe, TMP8);

    vis_or(DST_0, REF_0, TMP10);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP8, TMP8);

    vis_or(DST_2, REF_2, TMP12);
    vis_ld64_2(dest, stride_8, DST_2);

    vis_ld64(ref[0], TMP14);
    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_psub16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_psub16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);

    dest += stride;
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_2);

    vis_xor(DST_0, REF_0, TMP20);

    vis_and(TMP20, MASK_fe, TMP20);

    vis_xor(DST_2, REF_2, TMP22);
    vis_mul8x16(CONST_128, TMP20, TMP20);

    vis_and(TMP22, MASK_fe, TMP22);

    vis_or(DST_0, REF_0, TMP24);
    vis_mul8x16(CONST_128, TMP22, TMP22);

    vis_or(DST_2, REF_2, TMP26);

    vis_and(TMP20, MASK_7f, TMP20);

    vis_and(TMP22, MASK_7f, TMP22);

    vis_psub16(TMP24, TMP20, TMP20);
    vis_st64(TMP20, dest[0]);

    vis_psub16(TMP26, TMP22, TMP22);
    vis_st64_2(TMP22, dest, 8);
}
320
/* avg, no interpolation, 8 bytes wide: dest = rounded average of dest and
 * ref via the (x|y) - ((x^y)>>1) trick.  Software-pipelined: two rows per
 * loop iteration plus a two-row epilogue; height assumed even and >= 4 --
 * TODO confirm at call sites. */
static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * ref,
                            const int stride, int height)
{
    ref = vis_alignaddr(ref);

    /* Prologue: preload first row of ref/dest and the mask constants. */
    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(dest[0], DST_0);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 12 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP4);

        vis_ld64(ref[8], TMP2);
        vis_and(TMP4, MASK_fe, TMP4);

        vis_or(DST_0, REF_0, TMP6);
        vis_ld64_2(dest, stride, DST_0);
        ref += stride;
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_ld64(ref[0], TMP12);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(ref[8], TMP2);
        vis_xor(DST_0, REF_0, TMP0);
        ref += stride;

        vis_and(TMP0, MASK_fe, TMP0);

        vis_and(TMP4, MASK_7f, TMP4);

        vis_psub16(TMP6, TMP4, TMP4);
        vis_st64(TMP4, dest[0]);
        dest += stride;
        vis_mul8x16(CONST_128, TMP0, TMP0);

        vis_or(DST_0, REF_0, TMP6);
        vis_ld64_2(dest, stride, DST_0);

        vis_faligndata(TMP12, TMP2, REF_0);

        vis_and(TMP0, MASK_7f, TMP0);

        vis_psub16(TMP6, TMP0, TMP4);
        vis_st64(TMP4, dest[0]);
        dest += stride;
    } while (--height);

    /* Epilogue: final two rows without reading past the block. */
    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP4);

    vis_ld64(ref[8], TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_or(DST_0, REF_0, TMP6);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_xor(DST_0, REF_0, TMP0);

    vis_and(TMP0, MASK_fe, TMP0);

    vis_and(TMP4, MASK_7f, TMP4);

    vis_psub16(TMP6, TMP4, TMP4);
    vis_st64(TMP4, dest[0]);
    dest += stride;
    vis_mul8x16(CONST_128, TMP0, TMP0);

    vis_or(DST_0, REF_0, TMP6);

    vis_and(TMP0, MASK_7f, TMP0);

    vis_psub16(TMP6, TMP0, TMP4);
    vis_st64(TMP4, dest[0]);
}
412
/* put, horizontal half-pel, 16 bytes wide: dest = rounded average of
 * ref[x] and ref[x+1].  REF_0/REF_4 are ref aligned at `off`, REF_2/REF_6
 * at `off + 1`; the two are averaged with the (x|y) - ((x^y)>>1) trick.
 * When off == 7, off+1 == 8 cannot be expressed through alignaddr
 * (presumably the 3-bit align offset wraps -- confirm against the VIS
 * spec), so vis_src1 produces the shifted data instead.
 * Software-pipelined: two rows per iteration plus a two-row epilogue;
 * height assumed even and >= 4 -- TODO confirm at call sites. */
static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * ref,
                             const int stride, int height)
{
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;

    ref = vis_alignaddr(ref);

    /* Prologue: first row at both alignments, plus the mask constants. */
    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, 8, TMP2);

    vis_ld64_2(ref, 16, TMP4);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants128[0], CONST_128);
    vis_faligndata(TMP2, TMP4, REF_4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
        vis_faligndata(TMP2, TMP4, REF_6);
    } else {
        vis_src1(TMP2, REF_2);
        vis_src1(TMP4, REF_6);
    }

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 34 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP6);

        vis_ld64_2(ref, 8, TMP2);
        vis_xor(REF_4, REF_6, TMP8);

        vis_ld64_2(ref, 16, TMP4);
        vis_and(TMP6, MASK_fe, TMP6);
        ref += stride;

        vis_ld64(ref[0], TMP14);
        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_and(TMP8, MASK_fe, TMP8);

        vis_ld64_2(ref, 8, TMP16);
        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_or(REF_0, REF_2, TMP10);

        vis_ld64_2(ref, 16, TMP18);
        ref += stride;
        vis_or(REF_4, REF_6, TMP12);

        /* Re-align the freshly loaded row at `off` and `off + 1`. */
        vis_alignaddr_g0((void *)off);

        vis_faligndata(TMP0, TMP2, REF_0);

        vis_faligndata(TMP2, TMP4, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
        }

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_psub16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_psub16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
        dest += stride;

        /* Second row of the pair. */
        vis_xor(REF_0, REF_2, TMP6);

        vis_xor(REF_4, REF_6, TMP8);

        vis_and(TMP6, MASK_fe, TMP6);

        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_and(TMP8, MASK_fe, TMP8);

        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_or(REF_0, REF_2, TMP10);

        vis_or(REF_4, REF_6, TMP12);

        vis_alignaddr_g0((void *)off);

        vis_faligndata(TMP14, TMP16, REF_0);

        vis_faligndata(TMP16, TMP18, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP14, TMP16, REF_2);
            vis_faligndata(TMP16, TMP18, REF_6);
        } else {
            vis_src1(TMP16, REF_2);
            vis_src1(TMP18, REF_6);
        }

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_psub16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_psub16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
        dest += stride;
    } while (--height);

    /* Epilogue: the final two rows. */
    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_xor(REF_4, REF_6, TMP8);

    vis_ld64_2(ref, 16, TMP4);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_and(TMP8, MASK_fe, TMP8);

    vis_mul8x16(CONST_128, TMP8, TMP8);
    vis_or(REF_0, REF_2, TMP10);

    vis_or(REF_4, REF_6, TMP12);

    vis_alignaddr_g0((void *)off);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
        vis_faligndata(TMP2, TMP4, REF_6);
    } else {
        vis_src1(TMP2, REF_2);
        vis_src1(TMP4, REF_6);
    }

    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_psub16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_psub16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);
    dest += stride;

    vis_xor(REF_0, REF_2, TMP6);

    vis_xor(REF_4, REF_6, TMP8);

    vis_and(TMP6, MASK_fe, TMP6);

    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_and(TMP8, MASK_fe, TMP8);

    vis_mul8x16(CONST_128, TMP8, TMP8);
    vis_or(REF_0, REF_2, TMP10);

    vis_or(REF_4, REF_6, TMP12);

    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_psub16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_psub16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);
}
604
/* put, horizontal half-pel, 8 bytes wide: dest = rounded average of
 * ref[x] and ref[x+1] (REF_0 aligned at `off`, REF_2 at `off + 1`; the
 * off == 7 case uses vis_src1 for the +1 alignment).  Two rows per loop
 * iteration plus a two-row epilogue; height assumed even and >= 4 --
 * TODO confirm at call sites. */
static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * ref,
                            const int stride, int height)
{
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;

    ref = vis_alignaddr(ref);

    /* Prologue: first row at both alignments plus the mask constants. */
    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);

    vis_ld64(constants128[0], CONST_128);
    vis_faligndata(TMP0, TMP2, REF_0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
    } else {
        vis_src1(TMP2, REF_2);
    }

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 20 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP4);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP4, MASK_fe, TMP4);
        ref += stride;

        vis_ld64(ref[0], TMP8);
        vis_or(REF_0, REF_2, TMP6);
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, 8, TMP10);
        ref += stride;
        vis_faligndata(TMP0, TMP2, REF_0);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
        } else {
            vis_src1(TMP2, REF_2);
        }

        vis_and(TMP4, MASK_7f, TMP4);

        vis_psub16(TMP6, TMP4, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;

        /* Second row of the pair. */
        vis_xor(REF_0, REF_2, TMP12);

        vis_and(TMP12, MASK_fe, TMP12);

        vis_or(REF_0, REF_2, TMP14);
        vis_mul8x16(CONST_128, TMP12, TMP12);

        vis_alignaddr_g0((void *)off);
        vis_faligndata(TMP8, TMP10, REF_0);
        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP8, TMP10, REF_2);
        } else {
            vis_src1(TMP10, REF_2);
        }

        vis_and(TMP12, MASK_7f, TMP12);

        vis_psub16(TMP14, TMP12, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;
    } while (--height);

    /* Epilogue: the final two rows. */
    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP4);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_or(REF_0, REF_2, TMP6);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_alignaddr_g0((void *)off);

    vis_faligndata(TMP0, TMP2, REF_0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
    } else {
        vis_src1(TMP2, REF_2);
    }

    vis_and(TMP4, MASK_7f, TMP4);

    vis_psub16(TMP6, TMP4, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;

    vis_xor(REF_0, REF_2, TMP12);

    vis_and(TMP12, MASK_fe, TMP12);

    vis_or(REF_0, REF_2, TMP14);
    vis_mul8x16(CONST_128, TMP12, TMP12);

    vis_and(TMP12, MASK_7f, TMP12);

    vis_psub16(TMP14, TMP12, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;
}
727
/* avg, horizontal half-pel, 16 bytes wide: combine dest with the average
 * of ref[x] and ref[x+1], computed in 16-bit precision rather than with
 * the xor/or byte trick: ref bytes are widened via mul8x16au with 256
 * (upper half of constants256_512), dest bytes via mul8x16al with 512,
 * CONST_3 is added for rounding, and pack16 narrows back using the GSR
 * scale factor of 5 set at entry.  One row per loop iteration. */
static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * ref,
                             const int stride, int height)
{
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    vis_ld64(constants3[0], CONST_3);
    vis_fzero(ZERO);
    vis_ld64(constants256_512[0], CONST_256);

    ref = vis_alignaddr(ref);
    do {    /* 26 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);

        vis_alignaddr_g0((void *)off);

        vis_ld64(ref[16], TMP4);

        vis_ld64(dest[0], DST_0);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(dest[8], DST_2);
        vis_faligndata(TMP2, TMP4, REF_4);

        /* off == 7 needs vis_src1 for the +1 alignment (see MC_put_x). */
        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
        }

        /* Widen ref and ref+1 to 16 bits and accumulate with scaled dest. */
        vis_mul8x16au(REF_0, CONST_256, TMP0);

        vis_pmerge(ZERO, REF_2, TMP4);
        vis_mul8x16au(REF_0_1, CONST_256, TMP2);

        vis_pmerge(ZERO, REF_2_1, TMP6);

        vis_padd16(TMP0, TMP4, TMP0);

        vis_mul8x16al(DST_0, CONST_512, TMP4);
        vis_padd16(TMP2, TMP6, TMP2);

        vis_mul8x16al(DST_1, CONST_512, TMP6);

        vis_mul8x16au(REF_6, CONST_256, TMP12);

        vis_padd16(TMP0, TMP4, TMP0);
        vis_mul8x16au(REF_6_1, CONST_256, TMP14);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_mul8x16au(REF_4, CONST_256, TMP16);

        vis_padd16(TMP0, CONST_3, TMP8);
        vis_mul8x16au(REF_4_1, CONST_256, TMP18);

        vis_padd16(TMP2, CONST_3, TMP10);
        vis_pack16(TMP8, DST_0);

        vis_pack16(TMP10, DST_1);
        vis_padd16(TMP16, TMP12, TMP0);

        vis_st64(DST_0, dest[0]);
        vis_mul8x16al(DST_2, CONST_512, TMP4);
        vis_padd16(TMP18, TMP14, TMP2);

        vis_mul8x16al(DST_3, CONST_512, TMP6);
        vis_padd16(TMP0, CONST_3, TMP0);

        vis_padd16(TMP2, CONST_3, TMP2);

        vis_padd16(TMP0, TMP4, TMP0);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_pack16(TMP0, DST_2);

        vis_pack16(TMP2, DST_3);
        vis_st64(DST_2, dest[8]);

        ref += stride;
        dest += stride;
    } while (--height);
}
817
/* avg, horizontal half-pel, 8 bytes wide: same 16-bit arithmetic as
 * MC_avg_x_16_vis (ref and ref+1 widened with mul8x16au/256, dest scaled
 * with mul8x16al/512, +CONST_3 rounding, pack16 with GSR scale 5), but
 * processing FOUR rows per loop iteration (height >>= 2); rows 3-4 flow
 * through the REF_S* registers, and their dest data is parked in
 * TMP4/TMP5 because DST_2/DST_3 are still live -- hence the
 * TMP4/*DST_2*/ and TMP5/*DST_3*/ operands.
 * height assumed to be a multiple of 4 -- TODO confirm at call sites. */
static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * ref,
                            const int stride, int height)
{
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_times_2 = stride << 1;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    vis_ld64(constants3[0], CONST_3);
    vis_fzero(ZERO);
    vis_ld64(constants256_512[0], CONST_256);

    ref = vis_alignaddr(ref);
    height >>= 2;
    do {    /* 47 cycles */
        /* Load four consecutive ref rows. */
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);
        ref += stride;

        vis_alignaddr_g0((void *)off);

        vis_ld64(ref[0], TMP4);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, 8, TMP6);
        ref += stride;

        vis_ld64(ref[0], TMP8);

        vis_ld64_2(ref, 8, TMP10);
        ref += stride;
        vis_faligndata(TMP4, TMP6, REF_4);

        vis_ld64(ref[0], TMP12);

        vis_ld64_2(ref, 8, TMP14);
        ref += stride;
        vis_faligndata(TMP8, TMP10, REF_S0);

        vis_faligndata(TMP12, TMP14, REF_S4);

        if (off != 0x7) {
            /* ref+1 alignment for all four rows, interleaved with the
             * first two dest loads. */
            vis_alignaddr_g0((void *)off_plus_1);

            vis_ld64(dest[0], DST_0);
            vis_faligndata(TMP0, TMP2, REF_2);

            vis_ld64_2(dest, stride, DST_2);
            vis_faligndata(TMP4, TMP6, REF_6);

            vis_faligndata(TMP8, TMP10, REF_S2);

            vis_faligndata(TMP12, TMP14, REF_S6);
        } else {
            vis_ld64(dest[0], DST_0);
            vis_src1(TMP2, REF_2);

            vis_ld64_2(dest, stride, DST_2);
            vis_src1(TMP6, REF_6);

            vis_src1(TMP10, REF_S2);

            vis_src1(TMP14, REF_S6);
        }

        /* Rows 1-2: widen, accumulate with scaled dest, round, pack. */
        vis_pmerge(ZERO, REF_0, TMP0);
        vis_mul8x16au(REF_0_1, CONST_256, TMP2);

        vis_pmerge(ZERO, REF_2, TMP4);
        vis_mul8x16au(REF_2_1, CONST_256, TMP6);

        vis_padd16(TMP0, CONST_3, TMP0);
        vis_mul8x16al(DST_0, CONST_512, TMP16);

        vis_padd16(TMP2, CONST_3, TMP2);
        vis_mul8x16al(DST_1, CONST_512, TMP18);

        vis_padd16(TMP0, TMP4, TMP0);
        vis_mul8x16au(REF_4, CONST_256, TMP8);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_mul8x16au(REF_4_1, CONST_256, TMP10);

        vis_padd16(TMP0, TMP16, TMP0);
        vis_mul8x16au(REF_6, CONST_256, TMP12);

        vis_padd16(TMP2, TMP18, TMP2);
        vis_mul8x16au(REF_6_1, CONST_256, TMP14);

        vis_padd16(TMP8, CONST_3, TMP8);
        vis_mul8x16al(DST_2, CONST_512, TMP16);

        vis_padd16(TMP8, TMP12, TMP8);
        vis_mul8x16al(DST_3, CONST_512, TMP18);

        vis_padd16(TMP10, TMP14, TMP10);
        vis_pack16(TMP0, DST_0);

        vis_pack16(TMP2, DST_1);
        vis_st64(DST_0, dest[0]);
        dest += stride;
        vis_padd16(TMP10, CONST_3, TMP10);

        /* Load dest for rows 3-4 (row 4 into TMP4, see header note). */
        vis_ld64_2(dest, stride, DST_0);
        vis_padd16(TMP8, TMP16, TMP8);

        vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);
        vis_padd16(TMP10, TMP18, TMP10);
        vis_pack16(TMP8, DST_2);

        vis_pack16(TMP10, DST_3);
        vis_st64(DST_2, dest[0]);
        dest += stride;

        /* Rows 3-4: same computation on the REF_S* data. */
        vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
        vis_pmerge(ZERO, REF_S0, TMP0);

        vis_pmerge(ZERO, REF_S2, TMP24);
        vis_mul8x16au(REF_S2_1, CONST_256, TMP6);

        vis_padd16(TMP0, CONST_3, TMP0);
        vis_mul8x16au(REF_S4, CONST_256, TMP8);

        vis_padd16(TMP2, CONST_3, TMP2);
        vis_mul8x16au(REF_S4_1, CONST_256, TMP10);

        vis_padd16(TMP0, TMP24, TMP0);
        vis_mul8x16au(REF_S6, CONST_256, TMP12);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_mul8x16au(REF_S6_1, CONST_256, TMP14);

        vis_padd16(TMP8, CONST_3, TMP8);
        vis_mul8x16al(DST_0, CONST_512, TMP16);

        vis_padd16(TMP10, CONST_3, TMP10);
        vis_mul8x16al(DST_1, CONST_512, TMP18);

        vis_padd16(TMP8, TMP12, TMP8);
        vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);

        vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
        vis_padd16(TMP0, TMP16, TMP0);

        vis_padd16(TMP2, TMP18, TMP2);
        vis_pack16(TMP0, DST_0);

        vis_padd16(TMP10, TMP14, TMP10);
        vis_pack16(TMP2, DST_1);
        vis_st64(DST_0, dest[0]);
        dest += stride;

        vis_padd16(TMP8, TMP20, TMP8);

        vis_padd16(TMP10, TMP22, TMP10);
        vis_pack16(TMP8, DST_2);

        vis_pack16(TMP10, DST_3);
        vis_st64(DST_2, dest[0]);
        dest += stride;
    } while (--height);
}
982
86decad6 983static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 984 const int stride, int height)
44f54ceb 985{
bb270c08
DB
986 ref = vis_alignaddr(ref);
987 vis_ld64(ref[0], TMP0);
44f54ceb 988
bb270c08 989 vis_ld64_2(ref, 8, TMP2);
44f54ceb 990
bb270c08
DB
991 vis_ld64_2(ref, 16, TMP4);
992 ref += stride;
44f54ceb 993
bb270c08
DB
994 vis_ld64(ref[0], TMP6);
995 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 996
bb270c08
DB
997 vis_ld64_2(ref, 8, TMP8);
998 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 999
bb270c08
DB
1000 vis_ld64_2(ref, 16, TMP10);
1001 ref += stride;
44f54ceb 1002
bb270c08
DB
1003 vis_ld64(constants_fe[0], MASK_fe);
1004 vis_faligndata(TMP6, TMP8, REF_2);
44f54ceb 1005
bb270c08
DB
1006 vis_ld64(constants_7f[0], MASK_7f);
1007 vis_faligndata(TMP8, TMP10, REF_6);
44f54ceb 1008
bb270c08
DB
1009 vis_ld64(constants128[0], CONST_128);
1010 height = (height >> 1) - 1;
1011 do { /* 24 cycles */
1012 vis_ld64(ref[0], TMP0);
1013 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 1014
bb270c08
DB
1015 vis_ld64_2(ref, 8, TMP2);
1016 vis_xor(REF_4, REF_6, TMP16);
44f54ceb 1017
bb270c08
DB
1018 vis_ld64_2(ref, 16, TMP4);
1019 ref += stride;
1020 vis_or(REF_0, REF_2, TMP14);
44f54ceb 1021
bb270c08
DB
1022 vis_ld64(ref[0], TMP6);
1023 vis_or(REF_4, REF_6, TMP18);
44f54ceb 1024
bb270c08
DB
1025 vis_ld64_2(ref, 8, TMP8);
1026 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1027
bb270c08
DB
1028 vis_ld64_2(ref, 16, TMP10);
1029 ref += stride;
1030 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 1031
bb270c08 1032 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 1033
bb270c08
DB
1034 vis_and(TMP16, MASK_fe, TMP16);
1035 vis_mul8x16(CONST_128, TMP12, TMP12);
44f54ceb 1036
bb270c08
DB
1037 vis_mul8x16(CONST_128, TMP16, TMP16);
1038 vis_xor(REF_0, REF_2, TMP0);
44f54ceb 1039
bb270c08 1040 vis_xor(REF_4, REF_6, TMP2);
44f54ceb 1041
bb270c08 1042 vis_or(REF_0, REF_2, TMP20);
44f54ceb 1043
bb270c08 1044 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 1045
bb270c08 1046 vis_and(TMP16, MASK_7f, TMP16);
44f54ceb 1047
bb270c08
DB
1048 vis_psub16(TMP14, TMP12, TMP12);
1049 vis_st64(TMP12, dest[0]);
44f54ceb 1050
bb270c08
DB
1051 vis_psub16(TMP18, TMP16, TMP16);
1052 vis_st64_2(TMP16, dest, 8);
1053 dest += stride;
44f54ceb 1054
bb270c08 1055 vis_or(REF_4, REF_6, TMP18);
44f54ceb 1056
bb270c08 1057 vis_and(TMP0, MASK_fe, TMP0);
44f54ceb 1058
bb270c08
DB
1059 vis_and(TMP2, MASK_fe, TMP2);
1060 vis_mul8x16(CONST_128, TMP0, TMP0);
44f54ceb 1061
bb270c08
DB
1062 vis_faligndata(TMP6, TMP8, REF_2);
1063 vis_mul8x16(CONST_128, TMP2, TMP2);
44f54ceb 1064
bb270c08 1065 vis_faligndata(TMP8, TMP10, REF_6);
44f54ceb 1066
bb270c08 1067 vis_and(TMP0, MASK_7f, TMP0);
44f54ceb 1068
bb270c08 1069 vis_and(TMP2, MASK_7f, TMP2);
44f54ceb 1070
bb270c08
DB
1071 vis_psub16(TMP20, TMP0, TMP0);
1072 vis_st64(TMP0, dest[0]);
44f54ceb 1073
bb270c08
DB
1074 vis_psub16(TMP18, TMP2, TMP2);
1075 vis_st64_2(TMP2, dest, 8);
1076 dest += stride;
1077 } while (--height);
44f54ceb 1078
bb270c08
DB
1079 vis_ld64(ref[0], TMP0);
1080 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 1081
bb270c08
DB
1082 vis_ld64_2(ref, 8, TMP2);
1083 vis_xor(REF_4, REF_6, TMP16);
44f54ceb 1084
bb270c08
DB
1085 vis_ld64_2(ref, 16, TMP4);
1086 vis_or(REF_0, REF_2, TMP14);
44f54ceb 1087
bb270c08 1088 vis_or(REF_4, REF_6, TMP18);
44f54ceb 1089
bb270c08 1090 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1091
bb270c08 1092 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 1093
bb270c08 1094 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 1095
bb270c08
DB
1096 vis_and(TMP16, MASK_fe, TMP16);
1097 vis_mul8x16(CONST_128, TMP12, TMP12);
44f54ceb 1098
bb270c08
DB
1099 vis_mul8x16(CONST_128, TMP16, TMP16);
1100 vis_xor(REF_0, REF_2, TMP0);
44f54ceb 1101
bb270c08 1102 vis_xor(REF_4, REF_6, TMP2);
44f54ceb 1103
bb270c08 1104 vis_or(REF_0, REF_2, TMP20);
44f54ceb 1105
bb270c08 1106 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 1107
bb270c08 1108 vis_and(TMP16, MASK_7f, TMP16);
44f54ceb 1109
bb270c08
DB
1110 vis_psub16(TMP14, TMP12, TMP12);
1111 vis_st64(TMP12, dest[0]);
44f54ceb 1112
bb270c08
DB
1113 vis_psub16(TMP18, TMP16, TMP16);
1114 vis_st64_2(TMP16, dest, 8);
1115 dest += stride;
44f54ceb 1116
bb270c08 1117 vis_or(REF_4, REF_6, TMP18);
44f54ceb 1118
bb270c08 1119 vis_and(TMP0, MASK_fe, TMP0);
44f54ceb 1120
bb270c08
DB
1121 vis_and(TMP2, MASK_fe, TMP2);
1122 vis_mul8x16(CONST_128, TMP0, TMP0);
44f54ceb 1123
bb270c08 1124 vis_mul8x16(CONST_128, TMP2, TMP2);
44f54ceb 1125
bb270c08 1126 vis_and(TMP0, MASK_7f, TMP0);
44f54ceb 1127
bb270c08 1128 vis_and(TMP2, MASK_7f, TMP2);
44f54ceb 1129
bb270c08
DB
1130 vis_psub16(TMP20, TMP0, TMP0);
1131 vis_st64(TMP0, dest[0]);
44f54ceb 1132
bb270c08
DB
1133 vis_psub16(TMP18, TMP2, TMP2);
1134 vis_st64_2(TMP2, dest, 8);
44f54ceb
MN
1135}
1136
86decad6 1137static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 1138 const int stride, int height)
44f54ceb 1139{
bb270c08
DB
1140 ref = vis_alignaddr(ref);
1141 vis_ld64(ref[0], TMP0);
44f54ceb 1142
bb270c08
DB
1143 vis_ld64_2(ref, 8, TMP2);
1144 ref += stride;
44f54ceb 1145
bb270c08 1146 vis_ld64(ref[0], TMP4);
44f54ceb 1147
bb270c08
DB
1148 vis_ld64_2(ref, 8, TMP6);
1149 ref += stride;
44f54ceb 1150
bb270c08
DB
1151 vis_ld64(constants_fe[0], MASK_fe);
1152 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1153
bb270c08
DB
1154 vis_ld64(constants_7f[0], MASK_7f);
1155 vis_faligndata(TMP4, TMP6, REF_2);
44f54ceb 1156
bb270c08
DB
1157 vis_ld64(constants128[0], CONST_128);
1158 height = (height >> 1) - 1;
1159 do { /* 12 cycles */
1160 vis_ld64(ref[0], TMP0);
1161 vis_xor(REF_0, REF_2, TMP4);
44f54ceb 1162
bb270c08
DB
1163 vis_ld64_2(ref, 8, TMP2);
1164 ref += stride;
1165 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 1166
bb270c08
DB
1167 vis_or(REF_0, REF_2, TMP6);
1168 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 1169
bb270c08
DB
1170 vis_faligndata(TMP0, TMP2, REF_0);
1171 vis_ld64(ref[0], TMP0);
44f54ceb 1172
bb270c08
DB
1173 vis_ld64_2(ref, 8, TMP2);
1174 ref += stride;
1175 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 1176
bb270c08 1177 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 1178
bb270c08 1179 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 1180
bb270c08
DB
1181 vis_mul8x16(CONST_128, TMP12, TMP12);
1182 vis_or(REF_0, REF_2, TMP14);
44f54ceb 1183
bb270c08
DB
1184 vis_psub16(TMP6, TMP4, DST_0);
1185 vis_st64(DST_0, dest[0]);
1186 dest += stride;
44f54ceb 1187
bb270c08 1188 vis_faligndata(TMP0, TMP2, REF_2);
44f54ceb 1189
bb270c08 1190 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 1191
bb270c08
DB
1192 vis_psub16(TMP14, TMP12, DST_0);
1193 vis_st64(DST_0, dest[0]);
1194 dest += stride;
1195 } while (--height);
44f54ceb 1196
bb270c08
DB
1197 vis_ld64(ref[0], TMP0);
1198 vis_xor(REF_0, REF_2, TMP4);
44f54ceb 1199
bb270c08
DB
1200 vis_ld64_2(ref, 8, TMP2);
1201 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 1202
bb270c08
DB
1203 vis_or(REF_0, REF_2, TMP6);
1204 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 1205
bb270c08 1206 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1207
bb270c08 1208 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 1209
bb270c08 1210 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 1211
bb270c08 1212 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 1213
bb270c08
DB
1214 vis_mul8x16(CONST_128, TMP12, TMP12);
1215 vis_or(REF_0, REF_2, TMP14);
44f54ceb 1216
bb270c08
DB
1217 vis_psub16(TMP6, TMP4, DST_0);
1218 vis_st64(DST_0, dest[0]);
1219 dest += stride;
44f54ceb 1220
bb270c08 1221 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 1222
bb270c08
DB
1223 vis_psub16(TMP14, TMP12, DST_0);
1224 vis_st64(DST_0, dest[0]);
44f54ceb
MN
1225}
1226
86decad6 1227static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 1228 const int stride, int height)
44f54ceb 1229{
bb270c08
DB
1230 int stride_8 = stride + 8;
1231 int stride_16 = stride + 16;
44f54ceb 1232
bb270c08 1233 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 1234
bb270c08 1235 ref = vis_alignaddr(ref);
44f54ceb 1236
bb270c08
DB
1237 vis_ld64(ref[ 0], TMP0);
1238 vis_fzero(ZERO);
44f54ceb 1239
bb270c08 1240 vis_ld64(ref[ 8], TMP2);
44f54ceb 1241
bb270c08 1242 vis_ld64(ref[16], TMP4);
44f54ceb 1243
bb270c08
DB
1244 vis_ld64(constants3[0], CONST_3);
1245 vis_faligndata(TMP0, TMP2, REF_2);
44f54ceb 1246
bb270c08
DB
1247 vis_ld64(constants256_512[0], CONST_256);
1248 vis_faligndata(TMP2, TMP4, REF_6);
1249 height >>= 1;
44f54ceb 1250
bb270c08
DB
1251 do { /* 31 cycles */
1252 vis_ld64_2(ref, stride, TMP0);
1253 vis_pmerge(ZERO, REF_2, TMP12);
1254 vis_mul8x16au(REF_2_1, CONST_256, TMP14);
44f54ceb 1255
bb270c08
DB
1256 vis_ld64_2(ref, stride_8, TMP2);
1257 vis_pmerge(ZERO, REF_6, TMP16);
1258 vis_mul8x16au(REF_6_1, CONST_256, TMP18);
44f54ceb 1259
bb270c08
DB
1260 vis_ld64_2(ref, stride_16, TMP4);
1261 ref += stride;
44f54ceb 1262
bb270c08
DB
1263 vis_ld64(dest[0], DST_0);
1264 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1265
bb270c08
DB
1266 vis_ld64_2(dest, 8, DST_2);
1267 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 1268
bb270c08
DB
1269 vis_ld64_2(ref, stride, TMP6);
1270 vis_pmerge(ZERO, REF_0, TMP0);
1271 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
44f54ceb 1272
bb270c08
DB
1273 vis_ld64_2(ref, stride_8, TMP8);
1274 vis_pmerge(ZERO, REF_4, TMP4);
44f54ceb 1275
bb270c08
DB
1276 vis_ld64_2(ref, stride_16, TMP10);
1277 ref += stride;
44f54ceb 1278
bb270c08
DB
1279 vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
1280 vis_faligndata(TMP6, TMP8, REF_2);
1281 vis_mul8x16au(REF_4_1, CONST_256, TMP6);
44f54ceb 1282
bb270c08
DB
1283 vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
1284 vis_faligndata(TMP8, TMP10, REF_6);
1285 vis_mul8x16al(DST_0, CONST_512, TMP20);
44f54ceb 1286
bb270c08
DB
1287 vis_padd16(TMP0, CONST_3, TMP0);
1288 vis_mul8x16al(DST_1, CONST_512, TMP22);
44f54ceb 1289
bb270c08
DB
1290 vis_padd16(TMP2, CONST_3, TMP2);
1291 vis_mul8x16al(DST_2, CONST_512, TMP24);
44f54ceb 1292
bb270c08
DB
1293 vis_padd16(TMP4, CONST_3, TMP4);
1294 vis_mul8x16al(DST_3, CONST_512, TMP26);
44f54ceb 1295
bb270c08 1296 vis_padd16(TMP6, CONST_3, TMP6);
44f54ceb 1297
bb270c08
DB
1298 vis_padd16(TMP12, TMP20, TMP12);
1299 vis_mul8x16al(REF_S0, CONST_512, TMP20);
44f54ceb 1300
bb270c08
DB
1301 vis_padd16(TMP14, TMP22, TMP14);
1302 vis_mul8x16al(REF_S0_1, CONST_512, TMP22);
44f54ceb 1303
bb270c08
DB
1304 vis_padd16(TMP16, TMP24, TMP16);
1305 vis_mul8x16al(REF_S2, CONST_512, TMP24);
44f54ceb 1306
bb270c08
DB
1307 vis_padd16(TMP18, TMP26, TMP18);
1308 vis_mul8x16al(REF_S2_1, CONST_512, TMP26);
44f54ceb 1309
bb270c08
DB
1310 vis_padd16(TMP12, TMP0, TMP12);
1311 vis_mul8x16au(REF_2, CONST_256, TMP28);
44f54ceb 1312
bb270c08
DB
1313 vis_padd16(TMP14, TMP2, TMP14);
1314 vis_mul8x16au(REF_2_1, CONST_256, TMP30);
44f54ceb 1315
bb270c08
DB
1316 vis_padd16(TMP16, TMP4, TMP16);
1317 vis_mul8x16au(REF_6, CONST_256, REF_S4);
44f54ceb 1318
bb270c08
DB
1319 vis_padd16(TMP18, TMP6, TMP18);
1320 vis_mul8x16au(REF_6_1, CONST_256, REF_S6);
44f54ceb 1321
bb270c08
DB
1322 vis_pack16(TMP12, DST_0);
1323 vis_padd16(TMP28, TMP0, TMP12);
44f54ceb 1324
bb270c08
DB
1325 vis_pack16(TMP14, DST_1);
1326 vis_st64(DST_0, dest[0]);
1327 vis_padd16(TMP30, TMP2, TMP14);
44f54ceb 1328
bb270c08
DB
1329 vis_pack16(TMP16, DST_2);
1330 vis_padd16(REF_S4, TMP4, TMP16);
44f54ceb 1331
bb270c08
DB
1332 vis_pack16(TMP18, DST_3);
1333 vis_st64_2(DST_2, dest, 8);
1334 dest += stride;
1335 vis_padd16(REF_S6, TMP6, TMP18);
44f54ceb 1336
bb270c08 1337 vis_padd16(TMP12, TMP20, TMP12);
44f54ceb 1338
bb270c08
DB
1339 vis_padd16(TMP14, TMP22, TMP14);
1340 vis_pack16(TMP12, DST_0);
44f54ceb 1341
bb270c08
DB
1342 vis_padd16(TMP16, TMP24, TMP16);
1343 vis_pack16(TMP14, DST_1);
1344 vis_st64(DST_0, dest[0]);
44f54ceb 1345
bb270c08
DB
1346 vis_padd16(TMP18, TMP26, TMP18);
1347 vis_pack16(TMP16, DST_2);
44f54ceb 1348
bb270c08
DB
1349 vis_pack16(TMP18, DST_3);
1350 vis_st64_2(DST_2, dest, 8);
1351 dest += stride;
1352 } while (--height);
44f54ceb
MN
1353}
1354
86decad6 1355static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 1356 const int stride, int height)
44f54ceb 1357{
bb270c08 1358 int stride_8 = stride + 8;
44f54ceb 1359
bb270c08 1360 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 1361
bb270c08 1362 ref = vis_alignaddr(ref);
44f54ceb 1363
bb270c08
DB
1364 vis_ld64(ref[ 0], TMP0);
1365 vis_fzero(ZERO);
44f54ceb 1366
bb270c08 1367 vis_ld64(ref[ 8], TMP2);
44f54ceb 1368
bb270c08
DB
1369 vis_ld64(constants3[0], CONST_3);
1370 vis_faligndata(TMP0, TMP2, REF_2);
44f54ceb 1371
bb270c08 1372 vis_ld64(constants256_512[0], CONST_256);
44f54ceb 1373
bb270c08
DB
1374 height >>= 1;
1375 do { /* 20 cycles */
1376 vis_ld64_2(ref, stride, TMP0);
1377 vis_pmerge(ZERO, REF_2, TMP8);
1378 vis_mul8x16au(REF_2_1, CONST_256, TMP10);
44f54ceb 1379
bb270c08
DB
1380 vis_ld64_2(ref, stride_8, TMP2);
1381 ref += stride;
44f54ceb 1382
bb270c08 1383 vis_ld64(dest[0], DST_0);
44f54ceb 1384
bb270c08
DB
1385 vis_ld64_2(dest, stride, DST_2);
1386 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1387
bb270c08
DB
1388 vis_ld64_2(ref, stride, TMP4);
1389 vis_mul8x16al(DST_0, CONST_512, TMP16);
1390 vis_pmerge(ZERO, REF_0, TMP12);
44f54ceb 1391
bb270c08
DB
1392 vis_ld64_2(ref, stride_8, TMP6);
1393 ref += stride;
1394 vis_mul8x16al(DST_1, CONST_512, TMP18);
1395 vis_pmerge(ZERO, REF_0_1, TMP14);
44f54ceb 1396
bb270c08
DB
1397 vis_padd16(TMP12, CONST_3, TMP12);
1398 vis_mul8x16al(DST_2, CONST_512, TMP24);
44f54ceb 1399
bb270c08
DB
1400 vis_padd16(TMP14, CONST_3, TMP14);
1401 vis_mul8x16al(DST_3, CONST_512, TMP26);
44f54ceb 1402
bb270c08 1403 vis_faligndata(TMP4, TMP6, REF_2);
44f54ceb 1404
bb270c08 1405 vis_padd16(TMP8, TMP12, TMP8);
44f54ceb 1406
bb270c08
DB
1407 vis_padd16(TMP10, TMP14, TMP10);
1408 vis_mul8x16au(REF_2, CONST_256, TMP20);
44f54ceb 1409
bb270c08
DB
1410 vis_padd16(TMP8, TMP16, TMP0);
1411 vis_mul8x16au(REF_2_1, CONST_256, TMP22);
44f54ceb 1412
bb270c08
DB
1413 vis_padd16(TMP10, TMP18, TMP2);
1414 vis_pack16(TMP0, DST_0);
44f54ceb 1415
bb270c08
DB
1416 vis_pack16(TMP2, DST_1);
1417 vis_st64(DST_0, dest[0]);
1418 dest += stride;
1419 vis_padd16(TMP12, TMP20, TMP12);
44f54ceb 1420
bb270c08 1421 vis_padd16(TMP14, TMP22, TMP14);
44f54ceb 1422
bb270c08 1423 vis_padd16(TMP12, TMP24, TMP0);
44f54ceb 1424
bb270c08
DB
1425 vis_padd16(TMP14, TMP26, TMP2);
1426 vis_pack16(TMP0, DST_2);
44f54ceb 1427
bb270c08
DB
1428 vis_pack16(TMP2, DST_3);
1429 vis_st64(DST_2, dest[0]);
1430 dest += stride;
1431 } while (--height);
44f54ceb
MN
1432}
1433
86decad6 1434static void MC_put_xy_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 1435 const int stride, int height)
44f54ceb 1436{
bb270c08
DB
1437 unsigned long off = (unsigned long) ref & 0x7;
1438 unsigned long off_plus_1 = off + 1;
1439 int stride_8 = stride + 8;
1440 int stride_16 = stride + 16;
44f54ceb 1441
bb270c08 1442 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 1443
bb270c08 1444 ref = vis_alignaddr(ref);
44f54ceb 1445
bb270c08
DB
1446 vis_ld64(ref[ 0], TMP0);
1447 vis_fzero(ZERO);
44f54ceb 1448
bb270c08 1449 vis_ld64(ref[ 8], TMP2);
44f54ceb 1450
bb270c08 1451 vis_ld64(ref[16], TMP4);
44f54ceb 1452
bb270c08
DB
1453 vis_ld64(constants2[0], CONST_2);
1454 vis_faligndata(TMP0, TMP2, REF_S0);
44f54ceb 1455
bb270c08
DB
1456 vis_ld64(constants256_512[0], CONST_256);
1457 vis_faligndata(TMP2, TMP4, REF_S4);
44f54ceb 1458
bb270c08
DB
1459 if (off != 0x7) {
1460 vis_alignaddr_g0((void *)off_plus_1);
1461 vis_faligndata(TMP0, TMP2, REF_S2);
1462 vis_faligndata(TMP2, TMP4, REF_S6);
1463 } else {
1464 vis_src1(TMP2, REF_S2);
1465 vis_src1(TMP4, REF_S6);
1466 }
44f54ceb 1467
bb270c08
DB
1468 height >>= 1;
1469 do {
1470 vis_ld64_2(ref, stride, TMP0);
1471 vis_mul8x16au(REF_S0, CONST_256, TMP12);
1472 vis_pmerge(ZERO, REF_S0_1, TMP14);
44f54ceb 1473
bb270c08 1474 vis_alignaddr_g0((void *)off);
44f54ceb 1475
bb270c08
DB
1476 vis_ld64_2(ref, stride_8, TMP2);
1477 vis_mul8x16au(REF_S2, CONST_256, TMP16);
1478 vis_pmerge(ZERO, REF_S2_1, TMP18);
44f54ceb 1479
bb270c08
DB
1480 vis_ld64_2(ref, stride_16, TMP4);
1481 ref += stride;
1482 vis_mul8x16au(REF_S4, CONST_256, TMP20);
1483 vis_pmerge(ZERO, REF_S4_1, TMP22);
44f54ceb 1484
bb270c08
DB
1485 vis_ld64_2(ref, stride, TMP6);
1486 vis_mul8x16au(REF_S6, CONST_256, TMP24);
1487 vis_pmerge(ZERO, REF_S6_1, TMP26);
44f54ceb 1488
bb270c08
DB
1489 vis_ld64_2(ref, stride_8, TMP8);
1490 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1491
bb270c08
DB
1492 vis_ld64_2(ref, stride_16, TMP10);
1493 ref += stride;
1494 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 1495
bb270c08 1496 vis_faligndata(TMP6, TMP8, REF_S0);
44f54ceb 1497
bb270c08 1498 vis_faligndata(TMP8, TMP10, REF_S4);
44f54ceb 1499
bb270c08
DB
1500 if (off != 0x7) {
1501 vis_alignaddr_g0((void *)off_plus_1);
1502 vis_faligndata(TMP0, TMP2, REF_2);
1503 vis_faligndata(TMP2, TMP4, REF_6);
1504 vis_faligndata(TMP6, TMP8, REF_S2);
1505 vis_faligndata(TMP8, TMP10, REF_S6);
1506 } else {
1507 vis_src1(TMP2, REF_2);
1508 vis_src1(TMP4, REF_6);
1509 vis_src1(TMP8, REF_S2);
1510 vis_src1(TMP10, REF_S6);
1511 }
44f54ceb 1512
bb270c08
DB
1513 vis_mul8x16au(REF_0, CONST_256, TMP0);
1514 vis_pmerge(ZERO, REF_0_1, TMP2);
44f54ceb 1515
bb270c08
DB
1516 vis_mul8x16au(REF_2, CONST_256, TMP4);
1517 vis_pmerge(ZERO, REF_2_1, TMP6);
44f54ceb 1518
bb270c08
DB
1519 vis_padd16(TMP0, CONST_2, TMP8);
1520 vis_mul8x16au(REF_4, CONST_256, TMP0);
44f54ceb 1521
bb270c08
DB
1522 vis_padd16(TMP2, CONST_2, TMP10);
1523 vis_mul8x16au(REF_4_1, CONST_256, TMP2);
44f54ceb 1524
bb270c08
DB
1525 vis_padd16(TMP8, TMP4, TMP8);
1526 vis_mul8x16au(REF_6, CONST_256, TMP4);
44f54ceb 1527
bb270c08
DB
1528 vis_padd16(TMP10, TMP6, TMP10);
1529 vis_mul8x16au(REF_6_1, CONST_256, TMP6);
44f54ceb 1530
bb270c08 1531 vis_padd16(TMP12, TMP8, TMP12);
44f54ceb 1532
bb270c08 1533 vis_padd16(TMP14, TMP10, TMP14);
44f54ceb 1534
bb270c08 1535 vis_padd16(TMP12, TMP16, TMP12);
44f54ceb 1536
bb270c08
DB
1537 vis_padd16(TMP14, TMP18, TMP14);
1538 vis_pack16(TMP12, DST_0);
44f54ceb 1539
bb270c08
DB
1540 vis_pack16(TMP14, DST_1);
1541 vis_st64(DST_0, dest[0]);
1542 vis_padd16(TMP0, CONST_2, TMP12);
44f54ceb 1543
bb270c08
DB
1544 vis_mul8x16au(REF_S0, CONST_256, TMP0);
1545 vis_padd16(TMP2, CONST_2, TMP14);
44f54ceb 1546
bb270c08
DB
1547 vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
1548 vis_padd16(TMP12, TMP4, TMP12);
44f54ceb 1549
bb270c08
DB
1550 vis_mul8x16au(REF_S2, CONST_256, TMP4);
1551 vis_padd16(TMP14, TMP6, TMP14);
44f54ceb 1552
bb270c08
DB
1553 vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
1554 vis_padd16(TMP20, TMP12, TMP20);
44f54ceb 1555
bb270c08 1556 vis_padd16(TMP22, TMP14, TMP22);
44f54ceb 1557
bb270c08 1558 vis_padd16(TMP20, TMP24, TMP20);
44f54ceb 1559
bb270c08
DB
1560 vis_padd16(TMP22, TMP26, TMP22);
1561 vis_pack16(TMP20, DST_2);
44f54ceb 1562
bb270c08
DB
1563 vis_pack16(TMP22, DST_3);
1564 vis_st64_2(DST_2, dest, 8);
1565 dest += stride;
1566 vis_padd16(TMP0, TMP4, TMP24);
44f54ceb 1567
bb270c08
DB
1568 vis_mul8x16au(REF_S4, CONST_256, TMP0);
1569 vis_padd16(TMP2, TMP6, TMP26);
44f54ceb 1570
bb270c08
DB
1571 vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
1572 vis_padd16(TMP24, TMP8, TMP24);
44f54ceb 1573
bb270c08
DB
1574 vis_padd16(TMP26, TMP10, TMP26);
1575 vis_pack16(TMP24, DST_0);
44f54ceb 1576
bb270c08
DB
1577 vis_pack16(TMP26, DST_1);
1578 vis_st64(DST_0, dest[0]);
1579 vis_pmerge(ZERO, REF_S6, TMP4);
44f54ceb 1580
bb270c08 1581 vis_pmerge(ZERO, REF_S6_1, TMP6);
44f54ceb 1582
bb270c08 1583 vis_padd16(TMP0, TMP4, TMP0);
44f54ceb 1584
bb270c08 1585 vis_padd16(TMP2, TMP6, TMP2);
44f54ceb 1586
bb270c08 1587 vis_padd16(TMP0, TMP12, TMP0);
44f54ceb 1588
bb270c08
DB
1589 vis_padd16(TMP2, TMP14, TMP2);
1590 vis_pack16(TMP0, DST_2);
44f54ceb 1591
bb270c08
DB
1592 vis_pack16(TMP2, DST_3);
1593 vis_st64_2(DST_2, dest, 8);
1594 dest += stride;
1595 } while (--height);
44f54ceb
MN
1596}
1597
86decad6 1598static void MC_put_xy_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 1599 const int stride, int height)
44f54ceb 1600{
bb270c08
DB
1601 unsigned long off = (unsigned long) ref & 0x7;
1602 unsigned long off_plus_1 = off + 1;
1603 int stride_8 = stride + 8;
44f54ceb 1604
bb270c08 1605 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 1606
bb270c08 1607 ref = vis_alignaddr(ref);
44f54ceb 1608
bb270c08
DB
1609 vis_ld64(ref[ 0], TMP0);
1610 vis_fzero(ZERO);
44f54ceb 1611
bb270c08 1612 vis_ld64(ref[ 8], TMP2);
44f54ceb 1613
bb270c08 1614 vis_ld64(constants2[0], CONST_2);
44f54ceb 1615
bb270c08
DB
1616 vis_ld64(constants256_512[0], CONST_256);
1617 vis_faligndata(TMP0, TMP2, REF_S0);
44f54ceb 1618
bb270c08
DB
1619 if (off != 0x7) {
1620 vis_alignaddr_g0((void *)off_plus_1);
1621 vis_faligndata(TMP0, TMP2, REF_S2);
1622 } else {
1623 vis_src1(TMP2, REF_S2);
1624 }
44f54ceb 1625
bb270c08
DB
1626 height >>= 1;
1627 do { /* 26 cycles */
1628 vis_ld64_2(ref, stride, TMP0);
1629 vis_mul8x16au(REF_S0, CONST_256, TMP8);
1630 vis_pmerge(ZERO, REF_S2, TMP12);
44f54ceb 1631
bb270c08 1632 vis_alignaddr_g0((void *)off);
44f54ceb 1633
bb270c08
DB
1634 vis_ld64_2(ref, stride_8, TMP2);
1635 ref += stride;
1636 vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
1637 vis_pmerge(ZERO, REF_S2_1, TMP14);
44f54ceb 1638
bb270c08 1639 vis_ld64_2(ref, stride, TMP4);
44f54ceb 1640
bb270c08
DB
1641 vis_ld64_2(ref, stride_8, TMP6);
1642 ref += stride;
1643 vis_faligndata(TMP0, TMP2, REF_S4);
44f54ceb 1644
bb270c08 1645 vis_pmerge(ZERO, REF_S4, TMP18);
44f54ceb 1646
bb270c08 1647 vis_pmerge(ZERO, REF_S4_1, TMP20);
44f54ceb 1648
bb270c08 1649 vis_faligndata(TMP4, TMP6, REF_S0);
44f54ceb 1650
bb270c08
DB
1651 if (off != 0x7) {
1652 vis_alignaddr_g0((void *)off_plus_1);
1653 vis_faligndata(TMP0, TMP2, REF_S6);
1654 vis_faligndata(TMP4, TMP6, REF_S2);
1655 } else {
1656 vis_src1(TMP2, REF_S6);
1657 vis_src1(TMP6, REF_S2);
1658 }
44f54ceb 1659
bb270c08
DB
1660 vis_padd16(TMP18, CONST_2, TMP18);
1661 vis_mul8x16au(REF_S6, CONST_256, TMP22);
44f54ceb 1662
bb270c08
DB
1663 vis_padd16(TMP20, CONST_2, TMP20);
1664 vis_mul8x16au(REF_S6_1, CONST_256, TMP24);
44f54ceb 1665
bb270c08
DB
1666 vis_mul8x16au(REF_S0, CONST_256, TMP26);
1667 vis_pmerge(ZERO, REF_S0_1, TMP28);
44f54ceb 1668
bb270c08
DB
1669 vis_mul8x16au(REF_S2, CONST_256, TMP30);
1670 vis_padd16(TMP18, TMP22, TMP18);
44f54ceb 1671
bb270c08
DB
1672 vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
1673 vis_padd16(TMP20, TMP24, TMP20);
44f54ceb 1674
bb270c08 1675 vis_padd16(TMP8, TMP18, TMP8);
44f54ceb 1676
bb270c08 1677 vis_padd16(TMP10, TMP20, TMP10);
44f54ceb 1678
bb270c08 1679 vis_padd16(TMP8, TMP12, TMP8);
44f54ceb 1680
bb270c08
DB
1681 vis_padd16(TMP10, TMP14, TMP10);
1682 vis_pack16(TMP8, DST_0);
44f54ceb 1683
bb270c08
DB
1684 vis_pack16(TMP10, DST_1);
1685 vis_st64(DST_0, dest[0]);
1686 dest += stride;
1687 vis_padd16(TMP18, TMP26, TMP18);
44f54ceb 1688
bb270c08 1689 vis_padd16(TMP20, TMP28, TMP20);
44f54ceb 1690
bb270c08 1691 vis_padd16(TMP18, TMP30, TMP18);
44f54ceb 1692
bb270c08
DB
1693 vis_padd16(TMP20, TMP32, TMP20);
1694 vis_pack16(TMP18, DST_2);
44f54ceb 1695
bb270c08
DB
1696 vis_pack16(TMP20, DST_3);
1697 vis_st64(DST_2, dest[0]);
1698 dest += stride;
1699 } while (--height);
44f54ceb
MN
1700}
1701
86decad6 1702static void MC_avg_xy_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 1703 const int stride, int height)
44f54ceb 1704{
bb270c08
DB
1705 unsigned long off = (unsigned long) ref & 0x7;
1706 unsigned long off_plus_1 = off + 1;
1707 int stride_8 = stride + 8;
1708 int stride_16 = stride + 16;
44f54ceb 1709
bb270c08 1710 vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 1711
bb270c08 1712 ref = vis_alignaddr(ref);
44f54ceb 1713
bb270c08
DB
1714 vis_ld64(ref[ 0], TMP0);
1715 vis_fzero(ZERO);
44f54ceb 1716
bb270c08 1717 vis_ld64(ref[ 8], TMP2);
44f54ceb 1718
bb270c08 1719 vis_ld64(ref[16], TMP4);
44f54ceb 1720
bb270c08
DB
1721 vis_ld64(constants6[0], CONST_6);
1722 vis_faligndata(TMP0, TMP2, REF_S0);
44f54ceb 1723
bb270c08
DB
1724 vis_ld64(constants256_1024[0], CONST_256);
1725 vis_faligndata(TMP2, TMP4, REF_S4);
44f54ceb 1726
bb270c08
DB
1727 if (off != 0x7) {
1728 vis_alignaddr_g0((void *)off_plus_1);
1729 vis_faligndata(TMP0, TMP2, REF_S2);
1730 vis_faligndata(TMP2, TMP4, REF_S6);
1731 } else {
1732 vis_src1(TMP2, REF_S2);
1733 vis_src1(TMP4, REF_S6);
1734 }
44f54ceb 1735
bb270c08
DB
1736 height >>= 1;
1737 do { /* 55 cycles */
1738 vis_ld64_2(ref, stride, TMP0);
1739 vis_mul8x16au(REF_S0, CONST_256, TMP12);
1740 vis_pmerge(ZERO, REF_S0_1, TMP14);
44f54ceb 1741
bb270c08 1742 vis_alignaddr_g0((void *)off);
44f54ceb 1743
bb270c08
DB
1744 vis_ld64_2(ref, stride_8, TMP2);
1745 vis_mul8x16au(REF_S2, CONST_256, TMP16);
1746 vis_pmerge(ZERO, REF_S2_1, TMP18);
44f54ceb 1747
bb270c08
DB
1748 vis_ld64_2(ref, stride_16, TMP4);
1749 ref += stride;
1750 vis_mul8x16au(REF_S4, CONST_256, TMP20);
1751 vis_pmerge(ZERO, REF_S4_1, TMP22);
44f54ceb 1752
bb270c08
DB
1753 vis_ld64_2(ref, stride, TMP6);
1754 vis_mul8x16au(REF_S6, CONST_256, TMP24);
1755 vis_pmerge(ZERO, REF_S6_1, TMP26);
44f54ceb 1756
bb270c08
DB
1757 vis_ld64_2(ref, stride_8, TMP8);
1758 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1759
bb270c08
DB
1760 vis_ld64_2(ref, stride_16, TMP10);
1761 ref += stride;
1762 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 1763
bb270c08
DB
1764 vis_ld64(dest[0], DST_0);
1765 vis_faligndata(TMP6, TMP8, REF_S0);
44f54ceb 1766
bb270c08
DB
1767 vis_ld64_2(dest, 8, DST_2);
1768 vis_faligndata(TMP8, TMP10, REF_S4);
44f54ceb 1769
bb270c08
DB
1770 if (off != 0x7) {
1771 vis_alignaddr_g0((void *)off_plus_1);
1772 vis_faligndata(TMP0, TMP2, REF_2);
1773 vis_faligndata(TMP2, TMP4, REF_6);
1774 vis_faligndata(TMP6, TMP8, REF_S2);
1775 vis_faligndata(TMP8, TMP10, REF_S6);
1776 } else {
1777 vis_src1(TMP2, REF_2);
1778 vis_src1(TMP4, REF_6);
1779 vis_src1(TMP8, REF_S2);
1780 vis_src1(TMP10, REF_S6);
1781 }
44f54ceb 1782
bb270c08
DB
1783 vis_mul8x16al(DST_0, CONST_1024, TMP30);
1784 vis_pmerge(ZERO, REF_0, TMP0);
44f54ceb 1785
bb270c08
DB
1786 vis_mul8x16al(DST_1, CONST_1024, TMP32);
1787 vis_pmerge(ZERO, REF_0_1, TMP2);
44f54ceb 1788
bb270c08
DB
1789 vis_mul8x16au(REF_2, CONST_256, TMP4);
1790 vis_pmerge(ZERO, REF_2_1, TMP6);
44f54ceb 1791
bb270c08
DB
1792 vis_mul8x16al(DST_2, CONST_1024, REF_0);
1793 vis_padd16(TMP0, CONST_6, TMP0);
44f54ceb 1794
bb270c08
DB
1795 vis_mul8x16al(DST_3, CONST_1024, REF_2);
1796 vis_padd16(TMP2, CONST_6, TMP2);
44f54ceb 1797
bb270c08
DB
1798 vis_padd16(TMP0, TMP4, TMP0);
1799 vis_mul8x16au(REF_4, CONST_256, TMP4);
44f54ceb 1800
bb270c08
DB
1801 vis_padd16(TMP2, TMP6, TMP2);
1802 vis_mul8x16au(REF_4_1, CONST_256, TMP6);
44f54ceb 1803
bb270c08
DB
1804 vis_padd16(TMP12, TMP0, TMP12);
1805 vis_mul8x16au(REF_6, CONST_256, TMP8);
44f54ceb 1806
bb270c08
DB
1807 vis_padd16(TMP14, TMP2, TMP14);
1808 vis_mul8x16au(REF_6_1, CONST_256, TMP10);
44f54ceb 1809
bb270c08
DB
1810 vis_padd16(TMP12, TMP16, TMP12);
1811 vis_mul8x16au(REF_S0, CONST_256, REF_4);
44f54ceb 1812
bb270c08
DB
1813 vis_padd16(TMP14, TMP18, TMP14);
1814 vis_mul8x16au(REF_S0_1, CONST_256, REF_6);
44f54ceb 1815
bb270c08 1816 vis_padd16(TMP12, TMP30, TMP12);
44f54ceb 1817
bb270c08
DB
1818 vis_padd16(TMP14, TMP32, TMP14);
1819 vis_pack16(TMP12, DST_0);
44f54ceb 1820
bb270c08
DB
1821 vis_pack16(TMP14, DST_1);
1822 vis_st64(DST_0, dest[0]);
1823 vis_padd16(TMP4, CONST_6, TMP4);
44f54ceb 1824
bb270c08
DB
1825 vis_ld64_2(dest, stride, DST_0);
1826 vis_padd16(TMP6, CONST_6, TMP6);
1827 vis_mul8x16au(REF_S2, CONST_256, TMP12);
44f54ceb 1828
bb270c08
DB
1829 vis_padd16(TMP4, TMP8, TMP4);
1830 vis_mul8x16au(REF_S2_1, CONST_256, TMP14);
44f54ceb 1831
bb270c08 1832 vis_padd16(TMP6, TMP10, TMP6);
44f54ceb 1833
bb270c08 1834 vis_padd16(TMP20, TMP4, TMP20);
44f54ceb 1835
bb270c08 1836 vis_padd16(TMP22, TMP6, TMP22);
44f54ceb 1837
bb270c08 1838 vis_padd16(TMP20, TMP24, TMP20);
44f54ceb 1839
bb270c08 1840 vis_padd16(TMP22, TMP26, TMP22);
44f54ceb 1841
bb270c08
DB
1842 vis_padd16(TMP20, REF_0, TMP20);
1843 vis_mul8x16au(REF_S4, CONST_256, REF_0);
44f54ceb 1844
bb270c08
DB
1845 vis_padd16(TMP22, REF_2, TMP22);
1846 vis_pack16(TMP20, DST_2);
44f54ceb 1847
bb270c08
DB
1848 vis_pack16(TMP22, DST_3);
1849 vis_st64_2(DST_2, dest, 8);
1850 dest += stride;
44f54ceb 1851
bb270c08
DB
1852 vis_ld64_2(dest, 8, DST_2);
1853 vis_mul8x16al(DST_0, CONST_1024, TMP30);
1854 vis_pmerge(ZERO, REF_S4_1, REF_2);
44f54ceb 1855
bb270c08
DB
1856 vis_mul8x16al(DST_1, CONST_1024, TMP32);
1857 vis_padd16(REF_4, TMP0, TMP8);
44f54ceb 1858
bb270c08
DB
1859 vis_mul8x16au(REF_S6, CONST_256, REF_4);
1860 vis_padd16(REF_6, TMP2, TMP10);
44f54ceb 1861
bb270c08
DB
1862 vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
1863 vis_padd16(TMP8, TMP12, TMP8);
44f54ceb 1864
bb270c08 1865 vis_padd16(TMP10, TMP14, TMP10);
44f54ceb 1866
bb270c08 1867 vis_padd16(TMP8, TMP30, TMP8);
44f54ceb 1868
bb270c08
DB
1869 vis_padd16(TMP10, TMP32, TMP10);
1870 vis_pack16(TMP8, DST_0);
44f54ceb 1871
bb270c08
DB
1872 vis_pack16(TMP10, DST_1);
1873 vis_st64(DST_0, dest[0]);
44f54ceb 1874
bb270c08 1875 vis_padd16(REF_0, TMP4, REF_0);
44f54ceb 1876
bb270c08
DB
1877 vis_mul8x16al(DST_2, CONST_1024, TMP30);
1878 vis_padd16(REF_2, TMP6, REF_2);
44f54ceb 1879
bb270c08
DB
1880 vis_mul8x16al(DST_3, CONST_1024, TMP32);
1881 vis_padd16(REF_0, REF_4, REF_0);
44f54ceb 1882
bb270c08 1883 vis_padd16(REF_2, REF_6, REF_2);
44f54ceb 1884
bb270c08 1885 vis_padd16(REF_0, TMP30, REF_0);
44f54ceb 1886
bb270c08 1887 /* stall */
44f54ceb 1888
bb270c08
DB
1889 vis_padd16(REF_2, TMP32, REF_2);
1890 vis_pack16(REF_0, DST_2);
44f54ceb 1891
bb270c08
DB
1892 vis_pack16(REF_2, DST_3);
1893 vis_st64_2(DST_2, dest, 8);
1894 dest += stride;
1895 } while (--height);
44f54ceb
MN
1896}
1897
86decad6 1898static void MC_avg_xy_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 1899 const int stride, int height)
44f54ceb 1900{
bb270c08
DB
1901 unsigned long off = (unsigned long) ref & 0x7;
1902 unsigned long off_plus_1 = off + 1;
1903 int stride_8 = stride + 8;
44f54ceb 1904
bb270c08 1905 vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 1906
bb270c08 1907 ref = vis_alignaddr(ref);
44f54ceb 1908
bb270c08
DB
1909 vis_ld64(ref[0], TMP0);
1910 vis_fzero(ZERO);
44f54ceb 1911
bb270c08 1912 vis_ld64_2(ref, 8, TMP2);
44f54ceb 1913
bb270c08 1914 vis_ld64(constants6[0], CONST_6);
44f54ceb 1915
bb270c08
DB
1916 vis_ld64(constants256_1024[0], CONST_256);
1917 vis_faligndata(TMP0, TMP2, REF_S0);
44f54ceb 1918
bb270c08
DB
1919 if (off != 0x7) {
1920 vis_alignaddr_g0((void *)off_plus_1);
1921 vis_faligndata(TMP0, TMP2, REF_S2);
1922 } else {
1923 vis_src1(TMP2, REF_S2);
1924 }
44f54ceb 1925
bb270c08
DB
1926 height >>= 1;
1927 do { /* 31 cycles */
1928 vis_ld64_2(ref, stride, TMP0);
1929 vis_mul8x16au(REF_S0, CONST_256, TMP8);
1930 vis_pmerge(ZERO, REF_S0_1, TMP10);
44f54ceb 1931
bb270c08
DB
1932 vis_ld64_2(ref, stride_8, TMP2);
1933 ref += stride;
1934 vis_mul8x16au(REF_S2, CONST_256, TMP12);
1935 vis_pmerge(ZERO, REF_S2_1, TMP14);
44f54ceb 1936
bb270c08 1937 vis_alignaddr_g0((void *)off);
44f54ceb 1938
bb270c08
DB
1939 vis_ld64_2(ref, stride, TMP4);
1940 vis_faligndata(TMP0, TMP2, REF_S4);
44f54ceb 1941
bb270c08
DB
1942 vis_ld64_2(ref, stride_8, TMP6);
1943 ref += stride;
44f54ceb 1944
bb270c08
DB
1945 vis_ld64(dest[0], DST_0);
1946 vis_faligndata(TMP4, TMP6, REF_S0);
44f54ceb 1947
bb270c08 1948 vis_ld64_2(dest, stride, DST_2);
44f54ceb 1949
bb270c08
DB
1950 if (off != 0x7) {
1951 vis_alignaddr_g0((void *)off_plus_1);
1952 vis_faligndata(TMP0, TMP2, REF_S6);
1953 vis_faligndata(TMP4, TMP6, REF_S2);
1954 } else {
1955 vis_src1(TMP2, REF_S6);
1956 vis_src1(TMP6, REF_S2);
1957 }
44f54ceb 1958
bb270c08
DB
1959 vis_mul8x16al(DST_0, CONST_1024, TMP30);
1960 vis_pmerge(ZERO, REF_S4, TMP22);
44f54ceb 1961
bb270c08
DB
1962 vis_mul8x16al(DST_1, CONST_1024, TMP32);
1963 vis_pmerge(ZERO, REF_S4_1, TMP24);
44f54ceb 1964
bb270c08
DB
1965 vis_mul8x16au(REF_S6, CONST_256, TMP26);
1966 vis_pmerge(ZERO, REF_S6_1, TMP28);
44f54ceb 1967
bb270c08
DB
1968 vis_mul8x16au(REF_S0, CONST_256, REF_S4);
1969 vis_padd16(TMP22, CONST_6, TMP22);
44f54ceb 1970
bb270c08
DB
1971 vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
1972 vis_padd16(TMP24, CONST_6, TMP24);
44f54ceb 1973
bb270c08
DB
1974 vis_mul8x16al(DST_2, CONST_1024, REF_0);
1975 vis_padd16(TMP22, TMP26, TMP22);
44f54ceb 1976
bb270c08
DB
1977 vis_mul8x16al(DST_3, CONST_1024, REF_2);
1978 vis_padd16(TMP24, TMP28, TMP24);
44f54ceb 1979
bb270c08
DB
1980 vis_mul8x16au(REF_S2, CONST_256, TMP26);
1981 vis_padd16(TMP8, TMP22, TMP8);
44f54ceb 1982
bb270c08
DB
1983 vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
1984 vis_padd16(TMP10, TMP24, TMP10);
44f54ceb 1985
bb270c08 1986 vis_padd16(TMP8, TMP12, TMP8);
44f54ceb 1987
bb270c08 1988 vis_padd16(TMP10, TMP14, TMP10);
44f54ceb 1989
bb270c08 1990 vis_padd16(TMP8, TMP30, TMP8);
44f54ceb 1991
bb270c08
DB
1992 vis_padd16(TMP10, TMP32, TMP10);
1993 vis_pack16(TMP8, DST_0);
44f54ceb 1994
bb270c08
DB
1995 vis_pack16(TMP10, DST_1);
1996 vis_st64(DST_0, dest[0]);
1997 dest += stride;
44f54ceb 1998
bb270c08 1999 vis_padd16(REF_S4, TMP22, TMP12);
44f54ceb 2000
bb270c08 2001 vis_padd16(REF_S6, TMP24, TMP14);
44f54ceb 2002
bb270c08 2003 vis_padd16(TMP12, TMP26, TMP12);
44f54ceb 2004
bb270c08 2005 vis_padd16(TMP14, TMP28, TMP14);
44f54ceb 2006
bb270c08 2007 vis_padd16(TMP12, REF_0, TMP12);
44f54ceb 2008
bb270c08
DB
2009 vis_padd16(TMP14, REF_2, TMP14);
2010 vis_pack16(TMP12, DST_2);
44f54ceb 2011
bb270c08
DB
2012 vis_pack16(TMP14, DST_3);
2013 vis_st64(DST_2, dest[0]);
2014 dest += stride;
2015 } while (--height);
44f54ceb
MN
2016}
2017
2018/* End of rounding code */
2019
2020/* Start of no rounding code */
2021/* The trick used in some of this file is the formula from the MMX
2022 * motion comp code, which is:
2023 *
2024 * (x+y)>>1 == (x&y)+((x^y)>>1)
2025 *
2026 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
2027 * We avoid overflows by masking before we do the shift, and we
2028 * implement the shift by multiplying by 1/2 using mul8x16. So in
2029 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
2030 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
2031 * the value 0x80808080 is in f8):
2032 *
bb270c08
DB
2033 * fxor f0, f2, f10
2034 * fand f10, f4, f10
2035 * fmul8x16 f8, f10, f10
2036 * fand f10, f6, f10
2037 * fand f0, f2, f12
2038 * fpadd16 f12, f10, f10
44f54ceb
MN
2039 */
2040
86decad6 2041static void MC_put_no_round_o_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 2042 const int stride, int height)
44f54ceb 2043{
bb270c08
DB
2044 ref = vis_alignaddr(ref);
2045 do { /* 5 cycles */
2046 vis_ld64(ref[0], TMP0);
44f54ceb 2047
bb270c08 2048 vis_ld64_2(ref, 8, TMP2);
44f54ceb 2049
bb270c08
DB
2050 vis_ld64_2(ref, 16, TMP4);
2051 ref += stride;
44f54ceb 2052
bb270c08
DB
2053 vis_faligndata(TMP0, TMP2, REF_0);
2054 vis_st64(REF_0, dest[0]);
44f54ceb 2055
bb270c08
DB
2056 vis_faligndata(TMP2, TMP4, REF_2);
2057 vis_st64_2(REF_2, dest, 8);
2058 dest += stride;
2059 } while (--height);
44f54ceb
MN
2060}
2061
86decad6 2062static void MC_put_no_round_o_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 2063 const int stride, int height)
44f54ceb 2064{
bb270c08
DB
2065 ref = vis_alignaddr(ref);
2066 do { /* 4 cycles */
2067 vis_ld64(ref[0], TMP0);
44f54ceb 2068
bb270c08
DB
2069 vis_ld64(ref[8], TMP2);
2070 ref += stride;
44f54ceb 2071
bb270c08 2072 /* stall */
44f54ceb 2073
bb270c08
DB
2074 vis_faligndata(TMP0, TMP2, REF_0);
2075 vis_st64(REF_0, dest[0]);
2076 dest += stride;
2077 } while (--height);
44f54ceb
MN
2078}
2079
2080
86decad6 2081static void MC_avg_no_round_o_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 2082 const int stride, int height)
44f54ceb 2083{
bb270c08 2084 int stride_8 = stride + 8;
44f54ceb 2085
bb270c08 2086 ref = vis_alignaddr(ref);
44f54ceb 2087
bb270c08 2088 vis_ld64(ref[0], TMP0);
44f54ceb 2089
bb270c08 2090 vis_ld64(ref[8], TMP2);
44f54ceb 2091
bb270c08 2092 vis_ld64(ref[16], TMP4);
44f54ceb 2093
bb270c08 2094 vis_ld64(dest[0], DST_0);
44f54ceb 2095
bb270c08 2096 vis_ld64(dest[8], DST_2);
44f54ceb 2097
bb270c08
DB
2098 vis_ld64(constants_fe[0], MASK_fe);
2099 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2100
bb270c08
DB
2101 vis_ld64(constants_7f[0], MASK_7f);
2102 vis_faligndata(TMP2, TMP4, REF_2);
44f54ceb 2103
bb270c08 2104 vis_ld64(constants128[0], CONST_128);
44f54ceb 2105
bb270c08
DB
2106 ref += stride;
2107 height = (height >> 1) - 1;
44f54ceb 2108
bb270c08
DB
2109 do { /* 24 cycles */
2110 vis_ld64(ref[0], TMP0);
2111 vis_xor(DST_0, REF_0, TMP6);
44f54ceb 2112
bb270c08
DB
2113 vis_ld64_2(ref, 8, TMP2);
2114 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 2115
bb270c08
DB
2116 vis_ld64_2(ref, 16, TMP4);
2117 ref += stride;
2118 vis_mul8x16(CONST_128, TMP6, TMP6);
2119 vis_xor(DST_2, REF_2, TMP8);
44f54ceb 2120
bb270c08 2121 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 2122
bb270c08
DB
2123 vis_and(DST_0, REF_0, TMP10);
2124 vis_ld64_2(dest, stride, DST_0);
2125 vis_mul8x16(CONST_128, TMP8, TMP8);
44f54ceb 2126
bb270c08
DB
2127 vis_and(DST_2, REF_2, TMP12);
2128 vis_ld64_2(dest, stride_8, DST_2);
44f54ceb 2129
bb270c08
DB
2130 vis_ld64(ref[0], TMP14);
2131 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 2132
bb270c08 2133 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 2134
bb270c08
DB
2135 vis_padd16(TMP10, TMP6, TMP6);
2136 vis_st64(TMP6, dest[0]);
44f54ceb 2137
bb270c08
DB
2138 vis_padd16(TMP12, TMP8, TMP8);
2139 vis_st64_2(TMP8, dest, 8);
44f54ceb 2140
bb270c08
DB
2141 dest += stride;
2142 vis_ld64_2(ref, 8, TMP16);
2143 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2144
bb270c08
DB
2145 vis_ld64_2(ref, 16, TMP18);
2146 vis_faligndata(TMP2, TMP4, REF_2);
2147 ref += stride;
44f54ceb 2148
bb270c08 2149 vis_xor(DST_0, REF_0, TMP20);
44f54ceb 2150
bb270c08 2151 vis_and(TMP20, MASK_fe, TMP20);
44f54ceb 2152
bb270c08
DB
2153 vis_xor(DST_2, REF_2, TMP22);
2154 vis_mul8x16(CONST_128, TMP20, TMP20);
44f54ceb 2155
bb270c08 2156 vis_and(TMP22, MASK_fe, TMP22);
44f54ceb 2157
bb270c08
DB
2158 vis_and(DST_0, REF_0, TMP24);
2159 vis_mul8x16(CONST_128, TMP22, TMP22);
44f54ceb 2160
bb270c08 2161 vis_and(DST_2, REF_2, TMP26);
44f54ceb 2162
bb270c08
DB
2163 vis_ld64_2(dest, stride, DST_0);
2164 vis_faligndata(TMP14, TMP16, REF_0);
44f54ceb 2165
bb270c08
DB
2166 vis_ld64_2(dest, stride_8, DST_2);
2167 vis_faligndata(TMP16, TMP18, REF_2);
44f54ceb 2168
bb270c08 2169 vis_and(TMP20, MASK_7f, TMP20);
44f54ceb 2170
bb270c08 2171 vis_and(TMP22, MASK_7f, TMP22);
44f54ceb 2172
bb270c08
DB
2173 vis_padd16(TMP24, TMP20, TMP20);
2174 vis_st64(TMP20, dest[0]);
44f54ceb 2175
bb270c08
DB
2176 vis_padd16(TMP26, TMP22, TMP22);
2177 vis_st64_2(TMP22, dest, 8);
2178 dest += stride;
2179 } while (--height);
44f54ceb 2180
bb270c08
DB
2181 vis_ld64(ref[0], TMP0);
2182 vis_xor(DST_0, REF_0, TMP6);
44f54ceb 2183
bb270c08
DB
2184 vis_ld64_2(ref, 8, TMP2);
2185 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 2186
bb270c08
DB
2187 vis_ld64_2(ref, 16, TMP4);
2188 vis_mul8x16(CONST_128, TMP6, TMP6);
2189 vis_xor(DST_2, REF_2, TMP8);
44f54ceb 2190
bb270c08 2191 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 2192
bb270c08
DB
2193 vis_and(DST_0, REF_0, TMP10);
2194 vis_ld64_2(dest, stride, DST_0);
2195 vis_mul8x16(CONST_128, TMP8, TMP8);
44f54ceb 2196
bb270c08
DB
2197 vis_and(DST_2, REF_2, TMP12);
2198 vis_ld64_2(dest, stride_8, DST_2);
44f54ceb 2199
bb270c08
DB
2200 vis_ld64(ref[0], TMP14);
2201 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 2202
bb270c08 2203 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 2204
bb270c08
DB
2205 vis_padd16(TMP10, TMP6, TMP6);
2206 vis_st64(TMP6, dest[0]);
44f54ceb 2207
bb270c08
DB
2208 vis_padd16(TMP12, TMP8, TMP8);
2209 vis_st64_2(TMP8, dest, 8);
44f54ceb 2210
bb270c08
DB
2211 dest += stride;
2212 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2213
bb270c08 2214 vis_faligndata(TMP2, TMP4, REF_2);
44f54ceb 2215
bb270c08 2216 vis_xor(DST_0, REF_0, TMP20);
44f54ceb 2217
bb270c08 2218 vis_and(TMP20, MASK_fe, TMP20);
44f54ceb 2219
bb270c08
DB
2220 vis_xor(DST_2, REF_2, TMP22);
2221 vis_mul8x16(CONST_128, TMP20, TMP20);
44f54ceb 2222
bb270c08 2223 vis_and(TMP22, MASK_fe, TMP22);
44f54ceb 2224
bb270c08
DB
2225 vis_and(DST_0, REF_0, TMP24);
2226 vis_mul8x16(CONST_128, TMP22, TMP22);
44f54ceb 2227
bb270c08 2228 vis_and(DST_2, REF_2, TMP26);
44f54ceb 2229
bb270c08 2230 vis_and(TMP20, MASK_7f, TMP20);
44f54ceb 2231
bb270c08 2232 vis_and(TMP22, MASK_7f, TMP22);
44f54ceb 2233
bb270c08
DB
2234 vis_padd16(TMP24, TMP20, TMP20);
2235 vis_st64(TMP20, dest[0]);
44f54ceb 2236
bb270c08
DB
2237 vis_padd16(TMP26, TMP22, TMP22);
2238 vis_st64_2(TMP22, dest, 8);
44f54ceb
MN
2239}
2240
86decad6 2241static void MC_avg_no_round_o_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 2242 const int stride, int height)
44f54ceb 2243{
bb270c08 2244 ref = vis_alignaddr(ref);
44f54ceb 2245
bb270c08 2246 vis_ld64(ref[0], TMP0);
44f54ceb 2247
bb270c08 2248 vis_ld64(ref[8], TMP2);
44f54ceb 2249
bb270c08 2250 vis_ld64(dest[0], DST_0);
44f54ceb 2251
bb270c08 2252 vis_ld64(constants_fe[0], MASK_fe);
44f54ceb 2253
bb270c08
DB
2254 vis_ld64(constants_7f[0], MASK_7f);
2255 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2256
bb270c08 2257 vis_ld64(constants128[0], CONST_128);
44f54ceb 2258
bb270c08
DB
2259 ref += stride;
2260 height = (height >> 1) - 1;
44f54ceb 2261
bb270c08
DB
2262 do { /* 12 cycles */
2263 vis_ld64(ref[0], TMP0);
2264 vis_xor(DST_0, REF_0, TMP4);
44f54ceb 2265
bb270c08
DB
2266 vis_ld64(ref[8], TMP2);
2267 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 2268
bb270c08
DB
2269 vis_and(DST_0, REF_0, TMP6);
2270 vis_ld64_2(dest, stride, DST_0);
2271 ref += stride;
2272 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 2273
bb270c08
DB
2274 vis_ld64(ref[0], TMP12);
2275 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2276
bb270c08
DB
2277 vis_ld64(ref[8], TMP2);
2278 vis_xor(DST_0, REF_0, TMP0);
2279 ref += stride;
44f54ceb 2280
bb270c08 2281 vis_and(TMP0, MASK_fe, TMP0);
44f54ceb 2282
bb270c08 2283 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 2284
bb270c08
DB
2285 vis_padd16(TMP6, TMP4, TMP4);
2286 vis_st64(TMP4, dest[0]);
2287 dest += stride;
2288 vis_mul8x16(CONST_128, TMP0, TMP0);
44f54ceb 2289
bb270c08
DB
2290 vis_and(DST_0, REF_0, TMP6);
2291 vis_ld64_2(dest, stride, DST_0);
44f54ceb 2292
bb270c08 2293 vis_faligndata(TMP12, TMP2, REF_0);
44f54ceb 2294
bb270c08 2295 vis_and(TMP0, MASK_7f, TMP0);
44f54ceb 2296
bb270c08
DB
2297 vis_padd16(TMP6, TMP0, TMP4);
2298 vis_st64(TMP4, dest[0]);
2299 dest += stride;
2300 } while (--height);
44f54ceb 2301
bb270c08
DB
2302 vis_ld64(ref[0], TMP0);
2303 vis_xor(DST_0, REF_0, TMP4);
44f54ceb 2304
bb270c08
DB
2305 vis_ld64(ref[8], TMP2);
2306 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 2307
bb270c08
DB
2308 vis_and(DST_0, REF_0, TMP6);
2309 vis_ld64_2(dest, stride, DST_0);
2310 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 2311
bb270c08 2312 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2313
bb270c08 2314 vis_xor(DST_0, REF_0, TMP0);
44f54ceb 2315
bb270c08 2316 vis_and(TMP0, MASK_fe, TMP0);
44f54ceb 2317
bb270c08 2318 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 2319
bb270c08
DB
2320 vis_padd16(TMP6, TMP4, TMP4);
2321 vis_st64(TMP4, dest[0]);
2322 dest += stride;
2323 vis_mul8x16(CONST_128, TMP0, TMP0);
44f54ceb 2324
bb270c08 2325 vis_and(DST_0, REF_0, TMP6);
44f54ceb 2326
bb270c08 2327 vis_and(TMP0, MASK_7f, TMP0);
44f54ceb 2328
bb270c08
DB
2329 vis_padd16(TMP6, TMP0, TMP4);
2330 vis_st64(TMP4, dest[0]);
44f54ceb
MN
2331}
2332
86decad6 2333static void MC_put_no_round_x_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 2334 const int stride, int height)
44f54ceb 2335{
bb270c08
DB
2336 unsigned long off = (unsigned long) ref & 0x7;
2337 unsigned long off_plus_1 = off + 1;
44f54ceb 2338
bb270c08 2339 ref = vis_alignaddr(ref);
44f54ceb 2340
bb270c08 2341 vis_ld64(ref[0], TMP0);
44f54ceb 2342
bb270c08 2343 vis_ld64_2(ref, 8, TMP2);
44f54ceb 2344
bb270c08 2345 vis_ld64_2(ref, 16, TMP4);
44f54ceb 2346
bb270c08 2347 vis_ld64(constants_fe[0], MASK_fe);
44f54ceb 2348
bb270c08
DB
2349 vis_ld64(constants_7f[0], MASK_7f);
2350 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2351
bb270c08
DB
2352 vis_ld64(constants128[0], CONST_128);
2353 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 2354
bb270c08
DB
2355 if (off != 0x7) {
2356 vis_alignaddr_g0((void *)off_plus_1);
2357 vis_faligndata(TMP0, TMP2, REF_2);
2358 vis_faligndata(TMP2, TMP4, REF_6);
2359 } else {
2360 vis_src1(TMP2, REF_2);
2361 vis_src1(TMP4, REF_6);
2362 }
44f54ceb 2363
bb270c08
DB
2364 ref += stride;
2365 height = (height >> 1) - 1;
44f54ceb 2366
bb270c08
DB
2367 do { /* 34 cycles */
2368 vis_ld64(ref[0], TMP0);
2369 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 2370
bb270c08
DB
2371 vis_ld64_2(ref, 8, TMP2);
2372 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 2373
bb270c08
DB
2374 vis_ld64_2(ref, 16, TMP4);
2375 vis_and(TMP6, MASK_fe, TMP6);
2376 ref += stride;
44f54ceb 2377
bb270c08
DB
2378 vis_ld64(ref[0], TMP14);
2379 vis_mul8x16(CONST_128, TMP6, TMP6);
2380 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 2381
bb270c08
DB
2382 vis_ld64_2(ref, 8, TMP16);
2383 vis_mul8x16(CONST_128, TMP8, TMP8);
2384 vis_and(REF_0, REF_2, TMP10);
44f54ceb 2385
bb270c08
DB
2386 vis_ld64_2(ref, 16, TMP18);
2387 ref += stride;
2388 vis_and(REF_4, REF_6, TMP12);
44f54ceb 2389
bb270c08 2390 vis_alignaddr_g0((void *)off);
44f54ceb 2391
bb270c08 2392 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2393
bb270c08 2394 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 2395
bb270c08
DB
2396 if (off != 0x7) {
2397 vis_alignaddr_g0((void *)off_plus_1);
2398 vis_faligndata(TMP0, TMP2, REF_2);
2399 vis_faligndata(TMP2, TMP4, REF_6);
2400 } else {
2401 vis_src1(TMP2, REF_2);
2402 vis_src1(TMP4, REF_6);
2403 }
44f54ceb 2404
bb270c08 2405 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 2406
bb270c08 2407 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 2408
bb270c08
DB
2409 vis_padd16(TMP10, TMP6, TMP6);
2410 vis_st64(TMP6, dest[0]);
44f54ceb 2411
bb270c08
DB
2412 vis_padd16(TMP12, TMP8, TMP8);
2413 vis_st64_2(TMP8, dest, 8);
2414 dest += stride;
44f54ceb 2415
bb270c08 2416 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 2417
bb270c08 2418 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 2419
bb270c08 2420 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 2421
bb270c08
DB
2422 vis_mul8x16(CONST_128, TMP6, TMP6);
2423 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 2424
bb270c08
DB
2425 vis_mul8x16(CONST_128, TMP8, TMP8);
2426 vis_and(REF_0, REF_2, TMP10);
44f54ceb 2427
bb270c08 2428 vis_and(REF_4, REF_6, TMP12);
44f54ceb 2429
bb270c08 2430 vis_alignaddr_g0((void *)off);
44f54ceb 2431
bb270c08 2432 vis_faligndata(TMP14, TMP16, REF_0);
44f54ceb 2433
bb270c08 2434 vis_faligndata(TMP16, TMP18, REF_4);
44f54ceb 2435
bb270c08
DB
2436 if (off != 0x7) {
2437 vis_alignaddr_g0((void *)off_plus_1);
2438 vis_faligndata(TMP14, TMP16, REF_2);
2439 vis_faligndata(TMP16, TMP18, REF_6);
2440 } else {
2441 vis_src1(TMP16, REF_2);
2442 vis_src1(TMP18, REF_6);
2443 }
44f54ceb 2444
bb270c08 2445 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 2446
bb270c08 2447 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 2448
bb270c08
DB
2449 vis_padd16(TMP10, TMP6, TMP6);
2450 vis_st64(TMP6, dest[0]);
44f54ceb 2451
bb270c08
DB
2452 vis_padd16(TMP12, TMP8, TMP8);
2453 vis_st64_2(TMP8, dest, 8);
2454 dest += stride;
2455 } while (--height);
44f54ceb 2456
bb270c08
DB
2457 vis_ld64(ref[0], TMP0);
2458 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 2459
bb270c08
DB
2460 vis_ld64_2(ref, 8, TMP2);
2461 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 2462
bb270c08
DB
2463 vis_ld64_2(ref, 16, TMP4);
2464 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 2465
bb270c08
DB
2466 vis_mul8x16(CONST_128, TMP6, TMP6);
2467 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 2468
bb270c08
DB
2469 vis_mul8x16(CONST_128, TMP8, TMP8);
2470 vis_and(REF_0, REF_2, TMP10);
44f54ceb 2471
bb270c08 2472 vis_and(REF_4, REF_6, TMP12);
44f54ceb 2473
bb270c08 2474 vis_alignaddr_g0((void *)off);
44f54ceb 2475
bb270c08 2476 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2477
bb270c08 2478 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 2479
bb270c08
DB
2480 if (off != 0x7) {
2481 vis_alignaddr_g0((void *)off_plus_1);
2482 vis_faligndata(TMP0, TMP2, REF_2);
2483 vis_faligndata(TMP2, TMP4, REF_6);
2484 } else {
2485 vis_src1(TMP2, REF_2);
2486 vis_src1(TMP4, REF_6);
2487 }
44f54ceb 2488
bb270c08 2489 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 2490
bb270c08 2491 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 2492
bb270c08
DB
2493 vis_padd16(TMP10, TMP6, TMP6);
2494 vis_st64(TMP6, dest[0]);
44f54ceb 2495
bb270c08
DB
2496 vis_padd16(TMP12, TMP8, TMP8);
2497 vis_st64_2(TMP8, dest, 8);
2498 dest += stride;
44f54ceb 2499
bb270c08 2500 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 2501
bb270c08 2502 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 2503
bb270c08 2504 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 2505
bb270c08
DB
2506 vis_mul8x16(CONST_128, TMP6, TMP6);
2507 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 2508
bb270c08
DB
2509 vis_mul8x16(CONST_128, TMP8, TMP8);
2510 vis_and(REF_0, REF_2, TMP10);
44f54ceb 2511
bb270c08 2512 vis_and(REF_4, REF_6, TMP12);
44f54ceb 2513
bb270c08 2514 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 2515
bb270c08 2516 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 2517
bb270c08
DB
2518 vis_padd16(TMP10, TMP6, TMP6);
2519 vis_st64(TMP6, dest[0]);
44f54ceb 2520
bb270c08
DB
2521 vis_padd16(TMP12, TMP8, TMP8);
2522 vis_st64_2(TMP8, dest, 8);
44f54ceb
MN
2523}
2524
86decad6 2525static void MC_put_no_round_x_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 2526 const int stride, int height)
44f54ceb 2527{
bb270c08
DB
2528 unsigned long off = (unsigned long) ref & 0x7;
2529 unsigned long off_plus_1 = off + 1;
44f54ceb 2530
bb270c08 2531 ref = vis_alignaddr(ref);
44f54ceb 2532
bb270c08 2533 vis_ld64(ref[0], TMP0);
44f54ceb 2534
bb270c08 2535 vis_ld64(ref[8], TMP2);
44f54ceb 2536
bb270c08 2537 vis_ld64(constants_fe[0], MASK_fe);
44f54ceb 2538
bb270c08 2539 vis_ld64(constants_7f[0], MASK_7f);
44f54ceb 2540
bb270c08
DB
2541 vis_ld64(constants128[0], CONST_128);
2542 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2543
bb270c08
DB
2544 if (off != 0x7) {
2545 vis_alignaddr_g0((void *)off_plus_1);
2546 vis_faligndata(TMP0, TMP2, REF_2);
2547 } else {
2548 vis_src1(TMP2, REF_2);
2549 }
44f54ceb 2550
bb270c08
DB
2551 ref += stride;
2552 height = (height >> 1) - 1;
44f54ceb 2553
bb270c08
DB
2554 do { /* 20 cycles */
2555 vis_ld64(ref[0], TMP0);
2556 vis_xor(REF_0, REF_2, TMP4);
44f54ceb 2557
bb270c08
DB
2558 vis_ld64_2(ref, 8, TMP2);
2559 vis_and(TMP4, MASK_fe, TMP4);
2560 ref += stride;
44f54ceb 2561
bb270c08
DB
2562 vis_ld64(ref[0], TMP8);
2563 vis_and(REF_0, REF_2, TMP6);
2564 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 2565
bb270c08 2566 vis_alignaddr_g0((void *)off);
44f54ceb 2567
bb270c08
DB
2568 vis_ld64_2(ref, 8, TMP10);
2569 ref += stride;
2570 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2571
bb270c08
DB
2572 if (off != 0x7) {
2573 vis_alignaddr_g0((void *)off_plus_1);
2574 vis_faligndata(TMP0, TMP2, REF_2);
2575 } else {
2576 vis_src1(TMP2, REF_2);
2577 }
44f54ceb 2578
bb270c08 2579 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 2580
bb270c08
DB
2581 vis_padd16(TMP6, TMP4, DST_0);
2582 vis_st64(DST_0, dest[0]);
2583 dest += stride;
44f54ceb 2584
bb270c08 2585 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 2586
bb270c08 2587 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 2588
bb270c08
DB
2589 vis_and(REF_0, REF_2, TMP14);
2590 vis_mul8x16(CONST_128, TMP12, TMP12);
44f54ceb 2591
bb270c08
DB
2592 vis_alignaddr_g0((void *)off);
2593 vis_faligndata(TMP8, TMP10, REF_0);
2594 if (off != 0x7) {
2595 vis_alignaddr_g0((void *)off_plus_1);
2596 vis_faligndata(TMP8, TMP10, REF_2);
2597 } else {
2598 vis_src1(TMP10, REF_2);
2599 }
44f54ceb 2600
bb270c08 2601 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 2602
bb270c08
DB
2603 vis_padd16(TMP14, TMP12, DST_0);
2604 vis_st64(DST_0, dest[0]);
2605 dest += stride;
2606 } while (--height);
44f54ceb 2607
bb270c08
DB
2608 vis_ld64(ref[0], TMP0);
2609 vis_xor(REF_0, REF_2, TMP4);
44f54ceb 2610
bb270c08
DB
2611 vis_ld64_2(ref, 8, TMP2);
2612 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 2613
bb270c08
DB
2614 vis_and(REF_0, REF_2, TMP6);
2615 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 2616
bb270c08 2617 vis_alignaddr_g0((void *)off);
44f54ceb 2618
bb270c08 2619 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2620
bb270c08
DB
2621 if (off != 0x7) {
2622 vis_alignaddr_g0((void *)off_plus_1);
2623 vis_faligndata(TMP0, TMP2, REF_2);
2624 } else {
2625 vis_src1(TMP2, REF_2);
2626 }
44f54ceb 2627
bb270c08 2628 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 2629
bb270c08
DB
2630 vis_padd16(TMP6, TMP4, DST_0);
2631 vis_st64(DST_0, dest[0]);
2632 dest += stride;
44f54ceb 2633
bb270c08 2634 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 2635
bb270c08 2636 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 2637
bb270c08
DB
2638 vis_and(REF_0, REF_2, TMP14);
2639 vis_mul8x16(CONST_128, TMP12, TMP12);
44f54ceb 2640
bb270c08 2641 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 2642
bb270c08
DB
2643 vis_padd16(TMP14, TMP12, DST_0);
2644 vis_st64(DST_0, dest[0]);
2645 dest += stride;
44f54ceb
MN
2646}
2647
86decad6 2648static void MC_avg_no_round_x_16_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 2649 const int stride, int height)
44f54ceb 2650{
bb270c08
DB
2651 unsigned long off = (unsigned long) ref & 0x7;
2652 unsigned long off_plus_1 = off + 1;
44f54ceb 2653
bb270c08 2654 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 2655
bb270c08
DB
2656 vis_ld64(constants3[0], CONST_3);
2657 vis_fzero(ZERO);
2658 vis_ld64(constants256_512[0], CONST_256);
44f54ceb 2659
bb270c08
DB
2660 ref = vis_alignaddr(ref);
2661 do { /* 26 cycles */
2662 vis_ld64(ref[0], TMP0);
44f54ceb 2663
bb270c08 2664 vis_ld64(ref[8], TMP2);
44f54ceb 2665
bb270c08 2666 vis_alignaddr_g0((void *)off);
44f54ceb 2667
bb270c08 2668 vis_ld64(ref[16], TMP4);
44f54ceb 2669
bb270c08
DB
2670 vis_ld64(dest[0], DST_0);
2671 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2672
bb270c08
DB
2673 vis_ld64(dest[8], DST_2);
2674 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 2675
bb270c08
DB
2676 if (off != 0x7) {
2677 vis_alignaddr_g0((void *)off_plus_1);
2678 vis_faligndata(TMP0, TMP2, REF_2);
2679 vis_faligndata(TMP2, TMP4, REF_6);
2680 } else {
2681 vis_src1(TMP2, REF_2);
2682 vis_src1(TMP4, REF_6);
2683 }
44f54ceb 2684
bb270c08 2685 vis_mul8x16au(REF_0, CONST_256, TMP0);
44f54ceb 2686
bb270c08
DB
2687 vis_pmerge(ZERO, REF_2, TMP4);
2688 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
44f54ceb 2689
bb270c08 2690 vis_pmerge(ZERO, REF_2_1, TMP6);
44f54ceb 2691
bb270c08 2692 vis_padd16(TMP0, TMP4, TMP0);
44f54ceb 2693
bb270c08
DB
2694 vis_mul8x16al(DST_0, CONST_512, TMP4);
2695 vis_padd16(TMP2, TMP6, TMP2);
44f54ceb 2696
bb270c08 2697 vis_mul8x16al(DST_1, CONST_512, TMP6);
44f54ceb 2698
bb270c08 2699 vis_mul8x16au(REF_6, CONST_256, TMP12);
44f54ceb 2700
bb270c08
DB
2701 vis_padd16(TMP0, TMP4, TMP0);
2702 vis_mul8x16au(REF_6_1, CONST_256, TMP14);
44f54ceb 2703
bb270c08
DB
2704 vis_padd16(TMP2, TMP6, TMP2);
2705 vis_mul8x16au(REF_4, CONST_256, TMP16);
44f54ceb 2706
bb270c08
DB
2707 vis_padd16(TMP0, CONST_3, TMP8);
2708 vis_mul8x16au(REF_4_1, CONST_256, TMP18);
44f54ceb 2709
bb270c08
DB
2710 vis_padd16(TMP2, CONST_3, TMP10);
2711 vis_pack16(TMP8, DST_0);
44f54ceb 2712
bb270c08
DB
2713 vis_pack16(TMP10, DST_1);
2714 vis_padd16(TMP16, TMP12, TMP0);
44f54ceb 2715
bb270c08
DB
2716 vis_st64(DST_0, dest[0]);
2717 vis_mul8x16al(DST_2, CONST_512, TMP4);
2718 vis_padd16(TMP18, TMP14, TMP2);
44f54ceb 2719
bb270c08
DB
2720 vis_mul8x16al(DST_3, CONST_512, TMP6);
2721 vis_padd16(TMP0, CONST_3, TMP0);
44f54ceb 2722
bb270c08 2723 vis_padd16(TMP2, CONST_3, TMP2);
44f54ceb 2724
bb270c08 2725 vis_padd16(TMP0, TMP4, TMP0);
44f54ceb 2726
bb270c08
DB
2727 vis_padd16(TMP2, TMP6, TMP2);
2728 vis_pack16(TMP0, DST_2);
44f54ceb 2729
bb270c08
DB
2730 vis_pack16(TMP2, DST_3);
2731 vis_st64(DST_2, dest[8]);
44f54ceb 2732
bb270c08
DB
2733 ref += stride;
2734 dest += stride;
2735 } while (--height);
44f54ceb
MN
2736}
2737
86decad6 2738static void MC_avg_no_round_x_8_vis (uint8_t * dest, const uint8_t * ref,
bb270c08 2739 const int stride, int height)
44f54ceb 2740{
bb270c08
DB
2741 unsigned long off = (unsigned long) ref & 0x7;
2742 unsigned long off_plus_1 = off + 1;
2743 int stride_times_2 = stride << 1;
44f54ceb 2744
bb270c08 2745 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 2746
bb270c08
DB
2747 vis_ld64(constants3[0], CONST_3);