/*
 * Copyright (C) 2003 David S. Miller <davem@redhat.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* The *no_round* functions have been added by James A. Morrison, 2003,2004.
   The vis code from libmpeg2 was adapted for ffmpeg by James A. Morrison.
 */
25#include "config.h"
26
44f54ceb
MN
27#include <inttypes.h>
28
245976da 29#include "libavcodec/dsputil.h"
ad403802 30#include "dsputil_vis.h"
44f54ceb
MN
31
32#include "vis.h"
33
/* The trick used in some of this file is the formula from the MMX
 * motion comp code, which is:
 *
 * (x+y+1)>>1 == (x|y)-((x^y)>>1)
 *
 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
 * We avoid overflows by masking before we do the shift, and we
 * implement the shift by multiplying by 1/2 using mul8x16.  So in
 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
 * the value 0x80808080 is in f8):
 *
 *      fxor      f0,  f2, f10
 *      fand      f10, f4, f10
 *      fmul8x16  f8,  f10, f10
 *      fand      f10, f6, f10
 *      for       f0,  f2, f12
 *      fpsub16   f12, f10, f10
 */
53
54#define ATTR_ALIGN(alignd) __attribute__ ((aligned(alignd)))
55
56#define DUP4(x) {x, x, x, x}
57#define DUP8(x) {x, x, x, x, x, x, x, x}
58static const int16_t constants1[] ATTR_ALIGN(8) = DUP4 (1);
59static const int16_t constants2[] ATTR_ALIGN(8) = DUP4 (2);
60static const int16_t constants3[] ATTR_ALIGN(8) = DUP4 (3);
61static const int16_t constants6[] ATTR_ALIGN(8) = DUP4 (6);
62static const int8_t constants_fe[] ATTR_ALIGN(8) = DUP8 (0xfe);
63static const int8_t constants_7f[] ATTR_ALIGN(8) = DUP8 (0x7f);
64static const int8_t constants128[] ATTR_ALIGN(8) = DUP8 (128);
65static const int16_t constants256_512[] ATTR_ALIGN(8) =
bb270c08 66 {256, 512, 256, 512};
44f54ceb 67static const int16_t constants256_1024[] ATTR_ALIGN(8) =
bb270c08
DB
68 {256, 1024, 256, 1024};
69
70#define REF_0 0
71#define REF_0_1 1
72#define REF_2 2
73#define REF_2_1 3
74#define REF_4 4
75#define REF_4_1 5
76#define REF_6 6
77#define REF_6_1 7
78#define REF_S0 8
79#define REF_S0_1 9
80#define REF_S2 10
81#define REF_S2_1 11
82#define REF_S4 12
83#define REF_S4_1 13
84#define REF_S6 14
85#define REF_S6_1 15
86#define DST_0 16
87#define DST_1 17
88#define DST_2 18
89#define DST_3 19
90#define CONST_1 20
91#define CONST_2 20
92#define CONST_3 20
93#define CONST_6 20
94#define MASK_fe 20
95#define CONST_128 22
96#define CONST_256 22
97#define CONST_512 22
98#define CONST_1024 22
99#define TMP0 24
100#define TMP1 25
101#define TMP2 26
102#define TMP3 27
103#define TMP4 28
104#define TMP5 29
105#define ZERO 30
106#define MASK_7f 30
107
108#define TMP6 32
109#define TMP8 34
110#define TMP10 36
111#define TMP12 38
112#define TMP14 40
113#define TMP16 42
114#define TMP18 44
115#define TMP20 46
116#define TMP22 48
117#define TMP24 50
118#define TMP26 52
119#define TMP28 54
120#define TMP30 56
121#define TMP32 58
44f54ceb
MN
122
123static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 124 const int stride, int height)
44f54ceb 125{
bb270c08 126 uint8_t *ref = (uint8_t *) _ref;
44f54ceb 127
bb270c08
DB
128 ref = vis_alignaddr(ref);
129 do { /* 5 cycles */
130 vis_ld64(ref[0], TMP0);
44f54ceb 131
bb270c08 132 vis_ld64_2(ref, 8, TMP2);
44f54ceb 133
bb270c08
DB
134 vis_ld64_2(ref, 16, TMP4);
135 ref += stride;
44f54ceb 136
bb270c08
DB
137 vis_faligndata(TMP0, TMP2, REF_0);
138 vis_st64(REF_0, dest[0]);
44f54ceb 139
bb270c08
DB
140 vis_faligndata(TMP2, TMP4, REF_2);
141 vis_st64_2(REF_2, dest, 8);
142 dest += stride;
143 } while (--height);
44f54ceb
MN
144}
145
146static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 147 const int stride, int height)
44f54ceb 148{
bb270c08 149 uint8_t *ref = (uint8_t *) _ref;
44f54ceb 150
bb270c08
DB
151 ref = vis_alignaddr(ref);
152 do { /* 4 cycles */
153 vis_ld64(ref[0], TMP0);
44f54ceb 154
bb270c08
DB
155 vis_ld64(ref[8], TMP2);
156 ref += stride;
44f54ceb 157
bb270c08 158 /* stall */
44f54ceb 159
bb270c08
DB
160 vis_faligndata(TMP0, TMP2, REF_0);
161 vis_st64(REF_0, dest[0]);
162 dest += stride;
163 } while (--height);
44f54ceb
MN
164}
165
166
167static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 168 const int stride, int height)
44f54ceb 169{
bb270c08
DB
170 uint8_t *ref = (uint8_t *) _ref;
171 int stride_8 = stride + 8;
44f54ceb 172
bb270c08 173 ref = vis_alignaddr(ref);
44f54ceb 174
bb270c08 175 vis_ld64(ref[0], TMP0);
44f54ceb 176
bb270c08 177 vis_ld64(ref[8], TMP2);
44f54ceb 178
bb270c08 179 vis_ld64(ref[16], TMP4);
44f54ceb 180
bb270c08 181 vis_ld64(dest[0], DST_0);
44f54ceb 182
bb270c08 183 vis_ld64(dest[8], DST_2);
44f54ceb 184
bb270c08
DB
185 vis_ld64(constants_fe[0], MASK_fe);
186 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 187
bb270c08
DB
188 vis_ld64(constants_7f[0], MASK_7f);
189 vis_faligndata(TMP2, TMP4, REF_2);
44f54ceb 190
bb270c08 191 vis_ld64(constants128[0], CONST_128);
44f54ceb 192
bb270c08
DB
193 ref += stride;
194 height = (height >> 1) - 1;
44f54ceb 195
bb270c08
DB
196 do { /* 24 cycles */
197 vis_ld64(ref[0], TMP0);
198 vis_xor(DST_0, REF_0, TMP6);
44f54ceb 199
bb270c08
DB
200 vis_ld64_2(ref, 8, TMP2);
201 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 202
bb270c08
DB
203 vis_ld64_2(ref, 16, TMP4);
204 ref += stride;
205 vis_mul8x16(CONST_128, TMP6, TMP6);
206 vis_xor(DST_2, REF_2, TMP8);
44f54ceb 207
bb270c08 208 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 209
bb270c08
DB
210 vis_or(DST_0, REF_0, TMP10);
211 vis_ld64_2(dest, stride, DST_0);
212 vis_mul8x16(CONST_128, TMP8, TMP8);
44f54ceb 213
bb270c08
DB
214 vis_or(DST_2, REF_2, TMP12);
215 vis_ld64_2(dest, stride_8, DST_2);
44f54ceb 216
bb270c08
DB
217 vis_ld64(ref[0], TMP14);
218 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 219
bb270c08 220 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 221
bb270c08
DB
222 vis_psub16(TMP10, TMP6, TMP6);
223 vis_st64(TMP6, dest[0]);
44f54ceb 224
bb270c08
DB
225 vis_psub16(TMP12, TMP8, TMP8);
226 vis_st64_2(TMP8, dest, 8);
44f54ceb 227
bb270c08
DB
228 dest += stride;
229 vis_ld64_2(ref, 8, TMP16);
230 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 231
bb270c08
DB
232 vis_ld64_2(ref, 16, TMP18);
233 vis_faligndata(TMP2, TMP4, REF_2);
234 ref += stride;
44f54ceb 235
bb270c08 236 vis_xor(DST_0, REF_0, TMP20);
44f54ceb 237
bb270c08 238 vis_and(TMP20, MASK_fe, TMP20);
44f54ceb 239
bb270c08
DB
240 vis_xor(DST_2, REF_2, TMP22);
241 vis_mul8x16(CONST_128, TMP20, TMP20);
44f54ceb 242
bb270c08 243 vis_and(TMP22, MASK_fe, TMP22);
44f54ceb 244
bb270c08
DB
245 vis_or(DST_0, REF_0, TMP24);
246 vis_mul8x16(CONST_128, TMP22, TMP22);
44f54ceb 247
bb270c08 248 vis_or(DST_2, REF_2, TMP26);
44f54ceb 249
bb270c08
DB
250 vis_ld64_2(dest, stride, DST_0);
251 vis_faligndata(TMP14, TMP16, REF_0);
44f54ceb 252
bb270c08
DB
253 vis_ld64_2(dest, stride_8, DST_2);
254 vis_faligndata(TMP16, TMP18, REF_2);
44f54ceb 255
bb270c08 256 vis_and(TMP20, MASK_7f, TMP20);
44f54ceb 257
bb270c08 258 vis_and(TMP22, MASK_7f, TMP22);
44f54ceb 259
bb270c08
DB
260 vis_psub16(TMP24, TMP20, TMP20);
261 vis_st64(TMP20, dest[0]);
44f54ceb 262
bb270c08
DB
263 vis_psub16(TMP26, TMP22, TMP22);
264 vis_st64_2(TMP22, dest, 8);
265 dest += stride;
266 } while (--height);
44f54ceb 267
bb270c08
DB
268 vis_ld64(ref[0], TMP0);
269 vis_xor(DST_0, REF_0, TMP6);
44f54ceb 270
bb270c08
DB
271 vis_ld64_2(ref, 8, TMP2);
272 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 273
bb270c08
DB
274 vis_ld64_2(ref, 16, TMP4);
275 vis_mul8x16(CONST_128, TMP6, TMP6);
276 vis_xor(DST_2, REF_2, TMP8);
44f54ceb 277
bb270c08 278 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 279
bb270c08
DB
280 vis_or(DST_0, REF_0, TMP10);
281 vis_ld64_2(dest, stride, DST_0);
282 vis_mul8x16(CONST_128, TMP8, TMP8);
44f54ceb 283
bb270c08
DB
284 vis_or(DST_2, REF_2, TMP12);
285 vis_ld64_2(dest, stride_8, DST_2);
44f54ceb 286
bb270c08
DB
287 vis_ld64(ref[0], TMP14);
288 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 289
bb270c08 290 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 291
bb270c08
DB
292 vis_psub16(TMP10, TMP6, TMP6);
293 vis_st64(TMP6, dest[0]);
44f54ceb 294
bb270c08
DB
295 vis_psub16(TMP12, TMP8, TMP8);
296 vis_st64_2(TMP8, dest, 8);
44f54ceb 297
bb270c08
DB
298 dest += stride;
299 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 300
bb270c08 301 vis_faligndata(TMP2, TMP4, REF_2);
44f54ceb 302
bb270c08 303 vis_xor(DST_0, REF_0, TMP20);
44f54ceb 304
bb270c08 305 vis_and(TMP20, MASK_fe, TMP20);
44f54ceb 306
bb270c08
DB
307 vis_xor(DST_2, REF_2, TMP22);
308 vis_mul8x16(CONST_128, TMP20, TMP20);
44f54ceb 309
bb270c08 310 vis_and(TMP22, MASK_fe, TMP22);
44f54ceb 311
bb270c08
DB
312 vis_or(DST_0, REF_0, TMP24);
313 vis_mul8x16(CONST_128, TMP22, TMP22);
44f54ceb 314
bb270c08 315 vis_or(DST_2, REF_2, TMP26);
44f54ceb 316
bb270c08 317 vis_and(TMP20, MASK_7f, TMP20);
44f54ceb 318
bb270c08 319 vis_and(TMP22, MASK_7f, TMP22);
44f54ceb 320
bb270c08
DB
321 vis_psub16(TMP24, TMP20, TMP20);
322 vis_st64(TMP20, dest[0]);
44f54ceb 323
bb270c08
DB
324 vis_psub16(TMP26, TMP22, TMP22);
325 vis_st64_2(TMP22, dest, 8);
44f54ceb
MN
326}
327
328static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 329 const int stride, int height)
44f54ceb 330{
bb270c08 331 uint8_t *ref = (uint8_t *) _ref;
44f54ceb 332
bb270c08 333 ref = vis_alignaddr(ref);
44f54ceb 334
bb270c08 335 vis_ld64(ref[0], TMP0);
44f54ceb 336
bb270c08 337 vis_ld64(ref[8], TMP2);
44f54ceb 338
bb270c08 339 vis_ld64(dest[0], DST_0);
44f54ceb 340
bb270c08 341 vis_ld64(constants_fe[0], MASK_fe);
44f54ceb 342
bb270c08
DB
343 vis_ld64(constants_7f[0], MASK_7f);
344 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 345
bb270c08 346 vis_ld64(constants128[0], CONST_128);
44f54ceb 347
bb270c08
DB
348 ref += stride;
349 height = (height >> 1) - 1;
44f54ceb 350
bb270c08
DB
351 do { /* 12 cycles */
352 vis_ld64(ref[0], TMP0);
353 vis_xor(DST_0, REF_0, TMP4);
44f54ceb 354
bb270c08
DB
355 vis_ld64(ref[8], TMP2);
356 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 357
bb270c08
DB
358 vis_or(DST_0, REF_0, TMP6);
359 vis_ld64_2(dest, stride, DST_0);
360 ref += stride;
361 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 362
bb270c08
DB
363 vis_ld64(ref[0], TMP12);
364 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 365
bb270c08
DB
366 vis_ld64(ref[8], TMP2);
367 vis_xor(DST_0, REF_0, TMP0);
368 ref += stride;
44f54ceb 369
bb270c08 370 vis_and(TMP0, MASK_fe, TMP0);
44f54ceb 371
bb270c08 372 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 373
bb270c08
DB
374 vis_psub16(TMP6, TMP4, TMP4);
375 vis_st64(TMP4, dest[0]);
376 dest += stride;
377 vis_mul8x16(CONST_128, TMP0, TMP0);
44f54ceb 378
bb270c08
DB
379 vis_or(DST_0, REF_0, TMP6);
380 vis_ld64_2(dest, stride, DST_0);
44f54ceb 381
bb270c08 382 vis_faligndata(TMP12, TMP2, REF_0);
44f54ceb 383
bb270c08 384 vis_and(TMP0, MASK_7f, TMP0);
44f54ceb 385
bb270c08
DB
386 vis_psub16(TMP6, TMP0, TMP4);
387 vis_st64(TMP4, dest[0]);
388 dest += stride;
389 } while (--height);
44f54ceb 390
bb270c08
DB
391 vis_ld64(ref[0], TMP0);
392 vis_xor(DST_0, REF_0, TMP4);
44f54ceb 393
bb270c08
DB
394 vis_ld64(ref[8], TMP2);
395 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 396
bb270c08
DB
397 vis_or(DST_0, REF_0, TMP6);
398 vis_ld64_2(dest, stride, DST_0);
399 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 400
bb270c08 401 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 402
bb270c08 403 vis_xor(DST_0, REF_0, TMP0);
44f54ceb 404
bb270c08 405 vis_and(TMP0, MASK_fe, TMP0);
44f54ceb 406
bb270c08 407 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 408
bb270c08
DB
409 vis_psub16(TMP6, TMP4, TMP4);
410 vis_st64(TMP4, dest[0]);
411 dest += stride;
412 vis_mul8x16(CONST_128, TMP0, TMP0);
44f54ceb 413
bb270c08 414 vis_or(DST_0, REF_0, TMP6);
44f54ceb 415
bb270c08 416 vis_and(TMP0, MASK_7f, TMP0);
44f54ceb 417
bb270c08
DB
418 vis_psub16(TMP6, TMP0, TMP4);
419 vis_st64(TMP4, dest[0]);
44f54ceb
MN
420}
421
422static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 423 const int stride, int height)
44f54ceb 424{
bb270c08
DB
425 uint8_t *ref = (uint8_t *) _ref;
426 unsigned long off = (unsigned long) ref & 0x7;
427 unsigned long off_plus_1 = off + 1;
44f54ceb 428
bb270c08 429 ref = vis_alignaddr(ref);
44f54ceb 430
bb270c08 431 vis_ld64(ref[0], TMP0);
44f54ceb 432
bb270c08 433 vis_ld64_2(ref, 8, TMP2);
44f54ceb 434
bb270c08 435 vis_ld64_2(ref, 16, TMP4);
44f54ceb 436
bb270c08 437 vis_ld64(constants_fe[0], MASK_fe);
44f54ceb 438
bb270c08
DB
439 vis_ld64(constants_7f[0], MASK_7f);
440 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 441
bb270c08
DB
442 vis_ld64(constants128[0], CONST_128);
443 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 444
bb270c08
DB
445 if (off != 0x7) {
446 vis_alignaddr_g0((void *)off_plus_1);
447 vis_faligndata(TMP0, TMP2, REF_2);
448 vis_faligndata(TMP2, TMP4, REF_6);
449 } else {
450 vis_src1(TMP2, REF_2);
451 vis_src1(TMP4, REF_6);
452 }
44f54ceb 453
bb270c08
DB
454 ref += stride;
455 height = (height >> 1) - 1;
44f54ceb 456
bb270c08
DB
457 do { /* 34 cycles */
458 vis_ld64(ref[0], TMP0);
459 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 460
bb270c08
DB
461 vis_ld64_2(ref, 8, TMP2);
462 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 463
bb270c08
DB
464 vis_ld64_2(ref, 16, TMP4);
465 vis_and(TMP6, MASK_fe, TMP6);
466 ref += stride;
44f54ceb 467
bb270c08
DB
468 vis_ld64(ref[0], TMP14);
469 vis_mul8x16(CONST_128, TMP6, TMP6);
470 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 471
bb270c08
DB
472 vis_ld64_2(ref, 8, TMP16);
473 vis_mul8x16(CONST_128, TMP8, TMP8);
474 vis_or(REF_0, REF_2, TMP10);
44f54ceb 475
bb270c08
DB
476 vis_ld64_2(ref, 16, TMP18);
477 ref += stride;
478 vis_or(REF_4, REF_6, TMP12);
44f54ceb 479
bb270c08 480 vis_alignaddr_g0((void *)off);
44f54ceb 481
bb270c08 482 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 483
bb270c08 484 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 485
bb270c08
DB
486 if (off != 0x7) {
487 vis_alignaddr_g0((void *)off_plus_1);
488 vis_faligndata(TMP0, TMP2, REF_2);
489 vis_faligndata(TMP2, TMP4, REF_6);
490 } else {
491 vis_src1(TMP2, REF_2);
492 vis_src1(TMP4, REF_6);
493 }
44f54ceb 494
bb270c08 495 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 496
bb270c08 497 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 498
bb270c08
DB
499 vis_psub16(TMP10, TMP6, TMP6);
500 vis_st64(TMP6, dest[0]);
44f54ceb 501
bb270c08
DB
502 vis_psub16(TMP12, TMP8, TMP8);
503 vis_st64_2(TMP8, dest, 8);
504 dest += stride;
44f54ceb 505
bb270c08 506 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 507
bb270c08 508 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 509
bb270c08 510 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 511
bb270c08
DB
512 vis_mul8x16(CONST_128, TMP6, TMP6);
513 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 514
bb270c08
DB
515 vis_mul8x16(CONST_128, TMP8, TMP8);
516 vis_or(REF_0, REF_2, TMP10);
44f54ceb 517
bb270c08 518 vis_or(REF_4, REF_6, TMP12);
44f54ceb 519
bb270c08 520 vis_alignaddr_g0((void *)off);
44f54ceb 521
bb270c08 522 vis_faligndata(TMP14, TMP16, REF_0);
44f54ceb 523
bb270c08 524 vis_faligndata(TMP16, TMP18, REF_4);
44f54ceb 525
bb270c08
DB
526 if (off != 0x7) {
527 vis_alignaddr_g0((void *)off_plus_1);
528 vis_faligndata(TMP14, TMP16, REF_2);
529 vis_faligndata(TMP16, TMP18, REF_6);
530 } else {
531 vis_src1(TMP16, REF_2);
532 vis_src1(TMP18, REF_6);
533 }
44f54ceb 534
bb270c08 535 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 536
bb270c08 537 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 538
bb270c08
DB
539 vis_psub16(TMP10, TMP6, TMP6);
540 vis_st64(TMP6, dest[0]);
44f54ceb 541
bb270c08
DB
542 vis_psub16(TMP12, TMP8, TMP8);
543 vis_st64_2(TMP8, dest, 8);
544 dest += stride;
545 } while (--height);
44f54ceb 546
bb270c08
DB
547 vis_ld64(ref[0], TMP0);
548 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 549
bb270c08
DB
550 vis_ld64_2(ref, 8, TMP2);
551 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 552
bb270c08
DB
553 vis_ld64_2(ref, 16, TMP4);
554 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 555
bb270c08
DB
556 vis_mul8x16(CONST_128, TMP6, TMP6);
557 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 558
bb270c08
DB
559 vis_mul8x16(CONST_128, TMP8, TMP8);
560 vis_or(REF_0, REF_2, TMP10);
44f54ceb 561
bb270c08 562 vis_or(REF_4, REF_6, TMP12);
44f54ceb 563
bb270c08 564 vis_alignaddr_g0((void *)off);
44f54ceb 565
bb270c08 566 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 567
bb270c08 568 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 569
bb270c08
DB
570 if (off != 0x7) {
571 vis_alignaddr_g0((void *)off_plus_1);
572 vis_faligndata(TMP0, TMP2, REF_2);
573 vis_faligndata(TMP2, TMP4, REF_6);
574 } else {
575 vis_src1(TMP2, REF_2);
576 vis_src1(TMP4, REF_6);
577 }
44f54ceb 578
bb270c08 579 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 580
bb270c08 581 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 582
bb270c08
DB
583 vis_psub16(TMP10, TMP6, TMP6);
584 vis_st64(TMP6, dest[0]);
44f54ceb 585
bb270c08
DB
586 vis_psub16(TMP12, TMP8, TMP8);
587 vis_st64_2(TMP8, dest, 8);
588 dest += stride;
44f54ceb 589
bb270c08 590 vis_xor(REF_0, REF_2, TMP6);
44f54ceb 591
bb270c08 592 vis_xor(REF_4, REF_6, TMP8);
44f54ceb 593
bb270c08 594 vis_and(TMP6, MASK_fe, TMP6);
44f54ceb 595
bb270c08
DB
596 vis_mul8x16(CONST_128, TMP6, TMP6);
597 vis_and(TMP8, MASK_fe, TMP8);
44f54ceb 598
bb270c08
DB
599 vis_mul8x16(CONST_128, TMP8, TMP8);
600 vis_or(REF_0, REF_2, TMP10);
44f54ceb 601
bb270c08 602 vis_or(REF_4, REF_6, TMP12);
44f54ceb 603
bb270c08 604 vis_and(TMP6, MASK_7f, TMP6);
44f54ceb 605
bb270c08 606 vis_and(TMP8, MASK_7f, TMP8);
44f54ceb 607
bb270c08
DB
608 vis_psub16(TMP10, TMP6, TMP6);
609 vis_st64(TMP6, dest[0]);
44f54ceb 610
bb270c08
DB
611 vis_psub16(TMP12, TMP8, TMP8);
612 vis_st64_2(TMP8, dest, 8);
44f54ceb
MN
613}
614
615static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 616 const int stride, int height)
44f54ceb 617{
bb270c08
DB
618 uint8_t *ref = (uint8_t *) _ref;
619 unsigned long off = (unsigned long) ref & 0x7;
620 unsigned long off_plus_1 = off + 1;
44f54ceb 621
bb270c08 622 ref = vis_alignaddr(ref);
44f54ceb 623
bb270c08 624 vis_ld64(ref[0], TMP0);
44f54ceb 625
bb270c08 626 vis_ld64(ref[8], TMP2);
44f54ceb 627
bb270c08 628 vis_ld64(constants_fe[0], MASK_fe);
44f54ceb 629
bb270c08 630 vis_ld64(constants_7f[0], MASK_7f);
44f54ceb 631
bb270c08
DB
632 vis_ld64(constants128[0], CONST_128);
633 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 634
bb270c08
DB
635 if (off != 0x7) {
636 vis_alignaddr_g0((void *)off_plus_1);
637 vis_faligndata(TMP0, TMP2, REF_2);
638 } else {
639 vis_src1(TMP2, REF_2);
640 }
44f54ceb 641
bb270c08
DB
642 ref += stride;
643 height = (height >> 1) - 1;
44f54ceb 644
bb270c08
DB
645 do { /* 20 cycles */
646 vis_ld64(ref[0], TMP0);
647 vis_xor(REF_0, REF_2, TMP4);
44f54ceb 648
bb270c08
DB
649 vis_ld64_2(ref, 8, TMP2);
650 vis_and(TMP4, MASK_fe, TMP4);
651 ref += stride;
44f54ceb 652
bb270c08
DB
653 vis_ld64(ref[0], TMP8);
654 vis_or(REF_0, REF_2, TMP6);
655 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 656
bb270c08 657 vis_alignaddr_g0((void *)off);
44f54ceb 658
bb270c08
DB
659 vis_ld64_2(ref, 8, TMP10);
660 ref += stride;
661 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 662
bb270c08
DB
663 if (off != 0x7) {
664 vis_alignaddr_g0((void *)off_plus_1);
665 vis_faligndata(TMP0, TMP2, REF_2);
666 } else {
667 vis_src1(TMP2, REF_2);
668 }
44f54ceb 669
bb270c08 670 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 671
bb270c08
DB
672 vis_psub16(TMP6, TMP4, DST_0);
673 vis_st64(DST_0, dest[0]);
674 dest += stride;
44f54ceb 675
bb270c08 676 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 677
bb270c08 678 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 679
bb270c08
DB
680 vis_or(REF_0, REF_2, TMP14);
681 vis_mul8x16(CONST_128, TMP12, TMP12);
44f54ceb 682
bb270c08
DB
683 vis_alignaddr_g0((void *)off);
684 vis_faligndata(TMP8, TMP10, REF_0);
685 if (off != 0x7) {
686 vis_alignaddr_g0((void *)off_plus_1);
687 vis_faligndata(TMP8, TMP10, REF_2);
688 } else {
689 vis_src1(TMP10, REF_2);
690 }
44f54ceb 691
bb270c08 692 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 693
bb270c08
DB
694 vis_psub16(TMP14, TMP12, DST_0);
695 vis_st64(DST_0, dest[0]);
696 dest += stride;
697 } while (--height);
44f54ceb 698
bb270c08
DB
699 vis_ld64(ref[0], TMP0);
700 vis_xor(REF_0, REF_2, TMP4);
44f54ceb 701
bb270c08
DB
702 vis_ld64_2(ref, 8, TMP2);
703 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 704
bb270c08
DB
705 vis_or(REF_0, REF_2, TMP6);
706 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 707
bb270c08 708 vis_alignaddr_g0((void *)off);
44f54ceb 709
bb270c08 710 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 711
bb270c08
DB
712 if (off != 0x7) {
713 vis_alignaddr_g0((void *)off_plus_1);
714 vis_faligndata(TMP0, TMP2, REF_2);
715 } else {
716 vis_src1(TMP2, REF_2);
717 }
44f54ceb 718
bb270c08 719 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 720
bb270c08
DB
721 vis_psub16(TMP6, TMP4, DST_0);
722 vis_st64(DST_0, dest[0]);
723 dest += stride;
44f54ceb 724
bb270c08 725 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 726
bb270c08 727 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 728
bb270c08
DB
729 vis_or(REF_0, REF_2, TMP14);
730 vis_mul8x16(CONST_128, TMP12, TMP12);
44f54ceb 731
bb270c08 732 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 733
bb270c08
DB
734 vis_psub16(TMP14, TMP12, DST_0);
735 vis_st64(DST_0, dest[0]);
736 dest += stride;
44f54ceb
MN
737}
738
739static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 740 const int stride, int height)
44f54ceb 741{
bb270c08
DB
742 uint8_t *ref = (uint8_t *) _ref;
743 unsigned long off = (unsigned long) ref & 0x7;
744 unsigned long off_plus_1 = off + 1;
44f54ceb 745
bb270c08 746 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 747
bb270c08
DB
748 vis_ld64(constants3[0], CONST_3);
749 vis_fzero(ZERO);
750 vis_ld64(constants256_512[0], CONST_256);
44f54ceb 751
bb270c08
DB
752 ref = vis_alignaddr(ref);
753 do { /* 26 cycles */
754 vis_ld64(ref[0], TMP0);
44f54ceb 755
bb270c08 756 vis_ld64(ref[8], TMP2);
44f54ceb 757
bb270c08 758 vis_alignaddr_g0((void *)off);
44f54ceb 759
bb270c08 760 vis_ld64(ref[16], TMP4);
44f54ceb 761
bb270c08
DB
762 vis_ld64(dest[0], DST_0);
763 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 764
bb270c08
DB
765 vis_ld64(dest[8], DST_2);
766 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 767
bb270c08
DB
768 if (off != 0x7) {
769 vis_alignaddr_g0((void *)off_plus_1);
770 vis_faligndata(TMP0, TMP2, REF_2);
771 vis_faligndata(TMP2, TMP4, REF_6);
772 } else {
773 vis_src1(TMP2, REF_2);
774 vis_src1(TMP4, REF_6);
775 }
44f54ceb 776
bb270c08 777 vis_mul8x16au(REF_0, CONST_256, TMP0);
44f54ceb 778
bb270c08
DB
779 vis_pmerge(ZERO, REF_2, TMP4);
780 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
44f54ceb 781
bb270c08 782 vis_pmerge(ZERO, REF_2_1, TMP6);
44f54ceb 783
bb270c08 784 vis_padd16(TMP0, TMP4, TMP0);
44f54ceb 785
bb270c08
DB
786 vis_mul8x16al(DST_0, CONST_512, TMP4);
787 vis_padd16(TMP2, TMP6, TMP2);
44f54ceb 788
bb270c08 789 vis_mul8x16al(DST_1, CONST_512, TMP6);
44f54ceb 790
bb270c08 791 vis_mul8x16au(REF_6, CONST_256, TMP12);
44f54ceb 792
bb270c08
DB
793 vis_padd16(TMP0, TMP4, TMP0);
794 vis_mul8x16au(REF_6_1, CONST_256, TMP14);
44f54ceb 795
bb270c08
DB
796 vis_padd16(TMP2, TMP6, TMP2);
797 vis_mul8x16au(REF_4, CONST_256, TMP16);
44f54ceb 798
bb270c08
DB
799 vis_padd16(TMP0, CONST_3, TMP8);
800 vis_mul8x16au(REF_4_1, CONST_256, TMP18);
44f54ceb 801
bb270c08
DB
802 vis_padd16(TMP2, CONST_3, TMP10);
803 vis_pack16(TMP8, DST_0);
44f54ceb 804
bb270c08
DB
805 vis_pack16(TMP10, DST_1);
806 vis_padd16(TMP16, TMP12, TMP0);
44f54ceb 807
bb270c08
DB
808 vis_st64(DST_0, dest[0]);
809 vis_mul8x16al(DST_2, CONST_512, TMP4);
810 vis_padd16(TMP18, TMP14, TMP2);
44f54ceb 811
bb270c08
DB
812 vis_mul8x16al(DST_3, CONST_512, TMP6);
813 vis_padd16(TMP0, CONST_3, TMP0);
44f54ceb 814
bb270c08 815 vis_padd16(TMP2, CONST_3, TMP2);
44f54ceb 816
bb270c08 817 vis_padd16(TMP0, TMP4, TMP0);
44f54ceb 818
bb270c08
DB
819 vis_padd16(TMP2, TMP6, TMP2);
820 vis_pack16(TMP0, DST_2);
44f54ceb 821
bb270c08
DB
822 vis_pack16(TMP2, DST_3);
823 vis_st64(DST_2, dest[8]);
44f54ceb 824
bb270c08
DB
825 ref += stride;
826 dest += stride;
827 } while (--height);
44f54ceb
MN
828}
829
830static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 831 const int stride, int height)
44f54ceb 832{
bb270c08
DB
833 uint8_t *ref = (uint8_t *) _ref;
834 unsigned long off = (unsigned long) ref & 0x7;
835 unsigned long off_plus_1 = off + 1;
836 int stride_times_2 = stride << 1;
44f54ceb 837
bb270c08 838 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 839
bb270c08
DB
840 vis_ld64(constants3[0], CONST_3);
841 vis_fzero(ZERO);
842 vis_ld64(constants256_512[0], CONST_256);
44f54ceb 843
bb270c08
DB
844 ref = vis_alignaddr(ref);
845 height >>= 2;
846 do { /* 47 cycles */
847 vis_ld64(ref[0], TMP0);
44f54ceb 848
bb270c08
DB
849 vis_ld64_2(ref, 8, TMP2);
850 ref += stride;
44f54ceb 851
bb270c08 852 vis_alignaddr_g0((void *)off);
44f54ceb 853
bb270c08
DB
854 vis_ld64(ref[0], TMP4);
855 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 856
bb270c08
DB
857 vis_ld64_2(ref, 8, TMP6);
858 ref += stride;
44f54ceb 859
bb270c08 860 vis_ld64(ref[0], TMP8);
44f54ceb 861
bb270c08
DB
862 vis_ld64_2(ref, 8, TMP10);
863 ref += stride;
864 vis_faligndata(TMP4, TMP6, REF_4);
44f54ceb 865
bb270c08 866 vis_ld64(ref[0], TMP12);
44f54ceb 867
bb270c08
DB
868 vis_ld64_2(ref, 8, TMP14);
869 ref += stride;
870 vis_faligndata(TMP8, TMP10, REF_S0);
44f54ceb 871
bb270c08 872 vis_faligndata(TMP12, TMP14, REF_S4);
44f54ceb 873
bb270c08
DB
874 if (off != 0x7) {
875 vis_alignaddr_g0((void *)off_plus_1);
44f54ceb 876
bb270c08
DB
877 vis_ld64(dest[0], DST_0);
878 vis_faligndata(TMP0, TMP2, REF_2);
44f54ceb 879
bb270c08
DB
880 vis_ld64_2(dest, stride, DST_2);
881 vis_faligndata(TMP4, TMP6, REF_6);
44f54ceb 882
bb270c08 883 vis_faligndata(TMP8, TMP10, REF_S2);
44f54ceb 884
bb270c08
DB
885 vis_faligndata(TMP12, TMP14, REF_S6);
886 } else {
887 vis_ld64(dest[0], DST_0);
888 vis_src1(TMP2, REF_2);
44f54ceb 889
bb270c08
DB
890 vis_ld64_2(dest, stride, DST_2);
891 vis_src1(TMP6, REF_6);
44f54ceb 892
bb270c08 893 vis_src1(TMP10, REF_S2);
44f54ceb 894
bb270c08
DB
895 vis_src1(TMP14, REF_S6);
896 }
44f54ceb 897
bb270c08
DB
898 vis_pmerge(ZERO, REF_0, TMP0);
899 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
44f54ceb 900
bb270c08
DB
901 vis_pmerge(ZERO, REF_2, TMP4);
902 vis_mul8x16au(REF_2_1, CONST_256, TMP6);
44f54ceb 903
bb270c08
DB
904 vis_padd16(TMP0, CONST_3, TMP0);
905 vis_mul8x16al(DST_0, CONST_512, TMP16);
44f54ceb 906
bb270c08
DB
907 vis_padd16(TMP2, CONST_3, TMP2);
908 vis_mul8x16al(DST_1, CONST_512, TMP18);
44f54ceb 909
bb270c08
DB
910 vis_padd16(TMP0, TMP4, TMP0);
911 vis_mul8x16au(REF_4, CONST_256, TMP8);
44f54ceb 912
bb270c08
DB
913 vis_padd16(TMP2, TMP6, TMP2);
914 vis_mul8x16au(REF_4_1, CONST_256, TMP10);
44f54ceb 915
bb270c08
DB
916 vis_padd16(TMP0, TMP16, TMP0);
917 vis_mul8x16au(REF_6, CONST_256, TMP12);
44f54ceb 918
bb270c08
DB
919 vis_padd16(TMP2, TMP18, TMP2);
920 vis_mul8x16au(REF_6_1, CONST_256, TMP14);
44f54ceb 921
bb270c08
DB
922 vis_padd16(TMP8, CONST_3, TMP8);
923 vis_mul8x16al(DST_2, CONST_512, TMP16);
44f54ceb 924
bb270c08
DB
925 vis_padd16(TMP8, TMP12, TMP8);
926 vis_mul8x16al(DST_3, CONST_512, TMP18);
44f54ceb 927
bb270c08
DB
928 vis_padd16(TMP10, TMP14, TMP10);
929 vis_pack16(TMP0, DST_0);
44f54ceb 930
bb270c08
DB
931 vis_pack16(TMP2, DST_1);
932 vis_st64(DST_0, dest[0]);
933 dest += stride;
934 vis_padd16(TMP10, CONST_3, TMP10);
44f54ceb 935
bb270c08
DB
936 vis_ld64_2(dest, stride, DST_0);
937 vis_padd16(TMP8, TMP16, TMP8);
44f54ceb 938
bb270c08
DB
939 vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);
940 vis_padd16(TMP10, TMP18, TMP10);
941 vis_pack16(TMP8, DST_2);
44f54ceb 942
bb270c08
DB
943 vis_pack16(TMP10, DST_3);
944 vis_st64(DST_2, dest[0]);
945 dest += stride;
44f54ceb 946
bb270c08
DB
947 vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
948 vis_pmerge(ZERO, REF_S0, TMP0);
44f54ceb 949
bb270c08
DB
950 vis_pmerge(ZERO, REF_S2, TMP24);
951 vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
44f54ceb 952
bb270c08
DB
953 vis_padd16(TMP0, CONST_3, TMP0);
954 vis_mul8x16au(REF_S4, CONST_256, TMP8);
44f54ceb 955
bb270c08
DB
956 vis_padd16(TMP2, CONST_3, TMP2);
957 vis_mul8x16au(REF_S4_1, CONST_256, TMP10);
44f54ceb 958
bb270c08
DB
959 vis_padd16(TMP0, TMP24, TMP0);
960 vis_mul8x16au(REF_S6, CONST_256, TMP12);
44f54ceb 961
bb270c08
DB
962 vis_padd16(TMP2, TMP6, TMP2);
963 vis_mul8x16au(REF_S6_1, CONST_256, TMP14);
44f54ceb 964
bb270c08
DB
965 vis_padd16(TMP8, CONST_3, TMP8);
966 vis_mul8x16al(DST_0, CONST_512, TMP16);
44f54ceb 967
bb270c08
DB
968 vis_padd16(TMP10, CONST_3, TMP10);
969 vis_mul8x16al(DST_1, CONST_512, TMP18);
44f54ceb 970
bb270c08
DB
971 vis_padd16(TMP8, TMP12, TMP8);
972 vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);
44f54ceb 973
bb270c08
DB
974 vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
975 vis_padd16(TMP0, TMP16, TMP0);
44f54ceb 976
bb270c08
DB
977 vis_padd16(TMP2, TMP18, TMP2);
978 vis_pack16(TMP0, DST_0);
44f54ceb 979
bb270c08
DB
980 vis_padd16(TMP10, TMP14, TMP10);
981 vis_pack16(TMP2, DST_1);
982 vis_st64(DST_0, dest[0]);
983 dest += stride;
44f54ceb 984
bb270c08 985 vis_padd16(TMP8, TMP20, TMP8);
44f54ceb 986
bb270c08
DB
987 vis_padd16(TMP10, TMP22, TMP10);
988 vis_pack16(TMP8, DST_2);
44f54ceb 989
bb270c08
DB
990 vis_pack16(TMP10, DST_3);
991 vis_st64(DST_2, dest[0]);
992 dest += stride;
993 } while (--height);
44f54ceb
MN
994}
995
996static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 997 const int stride, int height)
44f54ceb 998{
bb270c08 999 uint8_t *ref = (uint8_t *) _ref;
44f54ceb 1000
bb270c08
DB
1001 ref = vis_alignaddr(ref);
1002 vis_ld64(ref[0], TMP0);
44f54ceb 1003
bb270c08 1004 vis_ld64_2(ref, 8, TMP2);
44f54ceb 1005
bb270c08
DB
1006 vis_ld64_2(ref, 16, TMP4);
1007 ref += stride;
44f54ceb 1008
bb270c08
DB
1009 vis_ld64(ref[0], TMP6);
1010 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1011
bb270c08
DB
1012 vis_ld64_2(ref, 8, TMP8);
1013 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 1014
bb270c08
DB
1015 vis_ld64_2(ref, 16, TMP10);
1016 ref += stride;
44f54ceb 1017
bb270c08
DB
1018 vis_ld64(constants_fe[0], MASK_fe);
1019 vis_faligndata(TMP6, TMP8, REF_2);
44f54ceb 1020
bb270c08
DB
1021 vis_ld64(constants_7f[0], MASK_7f);
1022 vis_faligndata(TMP8, TMP10, REF_6);
44f54ceb 1023
bb270c08
DB
1024 vis_ld64(constants128[0], CONST_128);
1025 height = (height >> 1) - 1;
1026 do { /* 24 cycles */
1027 vis_ld64(ref[0], TMP0);
1028 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 1029
bb270c08
DB
1030 vis_ld64_2(ref, 8, TMP2);
1031 vis_xor(REF_4, REF_6, TMP16);
44f54ceb 1032
bb270c08
DB
1033 vis_ld64_2(ref, 16, TMP4);
1034 ref += stride;
1035 vis_or(REF_0, REF_2, TMP14);
44f54ceb 1036
bb270c08
DB
1037 vis_ld64(ref[0], TMP6);
1038 vis_or(REF_4, REF_6, TMP18);
44f54ceb 1039
bb270c08
DB
1040 vis_ld64_2(ref, 8, TMP8);
1041 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1042
bb270c08
DB
1043 vis_ld64_2(ref, 16, TMP10);
1044 ref += stride;
1045 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 1046
bb270c08 1047 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 1048
bb270c08
DB
1049 vis_and(TMP16, MASK_fe, TMP16);
1050 vis_mul8x16(CONST_128, TMP12, TMP12);
44f54ceb 1051
bb270c08
DB
1052 vis_mul8x16(CONST_128, TMP16, TMP16);
1053 vis_xor(REF_0, REF_2, TMP0);
44f54ceb 1054
bb270c08 1055 vis_xor(REF_4, REF_6, TMP2);
44f54ceb 1056
bb270c08 1057 vis_or(REF_0, REF_2, TMP20);
44f54ceb 1058
bb270c08 1059 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 1060
bb270c08 1061 vis_and(TMP16, MASK_7f, TMP16);
44f54ceb 1062
bb270c08
DB
1063 vis_psub16(TMP14, TMP12, TMP12);
1064 vis_st64(TMP12, dest[0]);
44f54ceb 1065
bb270c08
DB
1066 vis_psub16(TMP18, TMP16, TMP16);
1067 vis_st64_2(TMP16, dest, 8);
1068 dest += stride;
44f54ceb 1069
bb270c08 1070 vis_or(REF_4, REF_6, TMP18);
44f54ceb 1071
bb270c08 1072 vis_and(TMP0, MASK_fe, TMP0);
44f54ceb 1073
bb270c08
DB
1074 vis_and(TMP2, MASK_fe, TMP2);
1075 vis_mul8x16(CONST_128, TMP0, TMP0);
44f54ceb 1076
bb270c08
DB
1077 vis_faligndata(TMP6, TMP8, REF_2);
1078 vis_mul8x16(CONST_128, TMP2, TMP2);
44f54ceb 1079
bb270c08 1080 vis_faligndata(TMP8, TMP10, REF_6);
44f54ceb 1081
bb270c08 1082 vis_and(TMP0, MASK_7f, TMP0);
44f54ceb 1083
bb270c08 1084 vis_and(TMP2, MASK_7f, TMP2);
44f54ceb 1085
bb270c08
DB
1086 vis_psub16(TMP20, TMP0, TMP0);
1087 vis_st64(TMP0, dest[0]);
44f54ceb 1088
bb270c08
DB
1089 vis_psub16(TMP18, TMP2, TMP2);
1090 vis_st64_2(TMP2, dest, 8);
1091 dest += stride;
1092 } while (--height);
44f54ceb 1093
bb270c08
DB
1094 vis_ld64(ref[0], TMP0);
1095 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 1096
bb270c08
DB
1097 vis_ld64_2(ref, 8, TMP2);
1098 vis_xor(REF_4, REF_6, TMP16);
44f54ceb 1099
bb270c08
DB
1100 vis_ld64_2(ref, 16, TMP4);
1101 vis_or(REF_0, REF_2, TMP14);
44f54ceb 1102
bb270c08 1103 vis_or(REF_4, REF_6, TMP18);
44f54ceb 1104
bb270c08 1105 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1106
bb270c08 1107 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 1108
bb270c08 1109 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 1110
bb270c08
DB
1111 vis_and(TMP16, MASK_fe, TMP16);
1112 vis_mul8x16(CONST_128, TMP12, TMP12);
44f54ceb 1113
bb270c08
DB
1114 vis_mul8x16(CONST_128, TMP16, TMP16);
1115 vis_xor(REF_0, REF_2, TMP0);
44f54ceb 1116
bb270c08 1117 vis_xor(REF_4, REF_6, TMP2);
44f54ceb 1118
bb270c08 1119 vis_or(REF_0, REF_2, TMP20);
44f54ceb 1120
bb270c08 1121 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 1122
bb270c08 1123 vis_and(TMP16, MASK_7f, TMP16);
44f54ceb 1124
bb270c08
DB
1125 vis_psub16(TMP14, TMP12, TMP12);
1126 vis_st64(TMP12, dest[0]);
44f54ceb 1127
bb270c08
DB
1128 vis_psub16(TMP18, TMP16, TMP16);
1129 vis_st64_2(TMP16, dest, 8);
1130 dest += stride;
44f54ceb 1131
bb270c08 1132 vis_or(REF_4, REF_6, TMP18);
44f54ceb 1133
bb270c08 1134 vis_and(TMP0, MASK_fe, TMP0);
44f54ceb 1135
bb270c08
DB
1136 vis_and(TMP2, MASK_fe, TMP2);
1137 vis_mul8x16(CONST_128, TMP0, TMP0);
44f54ceb 1138
bb270c08 1139 vis_mul8x16(CONST_128, TMP2, TMP2);
44f54ceb 1140
bb270c08 1141 vis_and(TMP0, MASK_7f, TMP0);
44f54ceb 1142
bb270c08 1143 vis_and(TMP2, MASK_7f, TMP2);
44f54ceb 1144
bb270c08
DB
1145 vis_psub16(TMP20, TMP0, TMP0);
1146 vis_st64(TMP0, dest[0]);
44f54ceb 1147
bb270c08
DB
1148 vis_psub16(TMP18, TMP2, TMP2);
1149 vis_st64_2(TMP2, dest, 8);
44f54ceb
MN
1150}
1151
1152static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 1153 const int stride, int height)
44f54ceb 1154{
bb270c08 1155 uint8_t *ref = (uint8_t *) _ref;
44f54ceb 1156
bb270c08
DB
1157 ref = vis_alignaddr(ref);
1158 vis_ld64(ref[0], TMP0);
44f54ceb 1159
bb270c08
DB
1160 vis_ld64_2(ref, 8, TMP2);
1161 ref += stride;
44f54ceb 1162
bb270c08 1163 vis_ld64(ref[0], TMP4);
44f54ceb 1164
bb270c08
DB
1165 vis_ld64_2(ref, 8, TMP6);
1166 ref += stride;
44f54ceb 1167
bb270c08
DB
1168 vis_ld64(constants_fe[0], MASK_fe);
1169 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1170
bb270c08
DB
1171 vis_ld64(constants_7f[0], MASK_7f);
1172 vis_faligndata(TMP4, TMP6, REF_2);
44f54ceb 1173
bb270c08
DB
1174 vis_ld64(constants128[0], CONST_128);
1175 height = (height >> 1) - 1;
1176 do { /* 12 cycles */
1177 vis_ld64(ref[0], TMP0);
1178 vis_xor(REF_0, REF_2, TMP4);
44f54ceb 1179
bb270c08
DB
1180 vis_ld64_2(ref, 8, TMP2);
1181 ref += stride;
1182 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 1183
bb270c08
DB
1184 vis_or(REF_0, REF_2, TMP6);
1185 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 1186
bb270c08
DB
1187 vis_faligndata(TMP0, TMP2, REF_0);
1188 vis_ld64(ref[0], TMP0);
44f54ceb 1189
bb270c08
DB
1190 vis_ld64_2(ref, 8, TMP2);
1191 ref += stride;
1192 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 1193
bb270c08 1194 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 1195
bb270c08 1196 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 1197
bb270c08
DB
1198 vis_mul8x16(CONST_128, TMP12, TMP12);
1199 vis_or(REF_0, REF_2, TMP14);
44f54ceb 1200
bb270c08
DB
1201 vis_psub16(TMP6, TMP4, DST_0);
1202 vis_st64(DST_0, dest[0]);
1203 dest += stride;
44f54ceb 1204
bb270c08 1205 vis_faligndata(TMP0, TMP2, REF_2);
44f54ceb 1206
bb270c08 1207 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 1208
bb270c08
DB
1209 vis_psub16(TMP14, TMP12, DST_0);
1210 vis_st64(DST_0, dest[0]);
1211 dest += stride;
1212 } while (--height);
44f54ceb 1213
bb270c08
DB
1214 vis_ld64(ref[0], TMP0);
1215 vis_xor(REF_0, REF_2, TMP4);
44f54ceb 1216
bb270c08
DB
1217 vis_ld64_2(ref, 8, TMP2);
1218 vis_and(TMP4, MASK_fe, TMP4);
44f54ceb 1219
bb270c08
DB
1220 vis_or(REF_0, REF_2, TMP6);
1221 vis_mul8x16(CONST_128, TMP4, TMP4);
44f54ceb 1222
bb270c08 1223 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1224
bb270c08 1225 vis_xor(REF_0, REF_2, TMP12);
44f54ceb 1226
bb270c08 1227 vis_and(TMP4, MASK_7f, TMP4);
44f54ceb 1228
bb270c08 1229 vis_and(TMP12, MASK_fe, TMP12);
44f54ceb 1230
bb270c08
DB
1231 vis_mul8x16(CONST_128, TMP12, TMP12);
1232 vis_or(REF_0, REF_2, TMP14);
44f54ceb 1233
bb270c08
DB
1234 vis_psub16(TMP6, TMP4, DST_0);
1235 vis_st64(DST_0, dest[0]);
1236 dest += stride;
44f54ceb 1237
bb270c08 1238 vis_and(TMP12, MASK_7f, TMP12);
44f54ceb 1239
bb270c08
DB
1240 vis_psub16(TMP14, TMP12, DST_0);
1241 vis_st64(DST_0, dest[0]);
44f54ceb
MN
1242}
1243
1244static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 1245 const int stride, int height)
44f54ceb 1246{
bb270c08
DB
1247 uint8_t *ref = (uint8_t *) _ref;
1248 int stride_8 = stride + 8;
1249 int stride_16 = stride + 16;
44f54ceb 1250
bb270c08 1251 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 1252
bb270c08 1253 ref = vis_alignaddr(ref);
44f54ceb 1254
bb270c08
DB
1255 vis_ld64(ref[ 0], TMP0);
1256 vis_fzero(ZERO);
44f54ceb 1257
bb270c08 1258 vis_ld64(ref[ 8], TMP2);
44f54ceb 1259
bb270c08 1260 vis_ld64(ref[16], TMP4);
44f54ceb 1261
bb270c08
DB
1262 vis_ld64(constants3[0], CONST_3);
1263 vis_faligndata(TMP0, TMP2, REF_2);
44f54ceb 1264
bb270c08
DB
1265 vis_ld64(constants256_512[0], CONST_256);
1266 vis_faligndata(TMP2, TMP4, REF_6);
1267 height >>= 1;
44f54ceb 1268
bb270c08
DB
1269 do { /* 31 cycles */
1270 vis_ld64_2(ref, stride, TMP0);
1271 vis_pmerge(ZERO, REF_2, TMP12);
1272 vis_mul8x16au(REF_2_1, CONST_256, TMP14);
44f54ceb 1273
bb270c08
DB
1274 vis_ld64_2(ref, stride_8, TMP2);
1275 vis_pmerge(ZERO, REF_6, TMP16);
1276 vis_mul8x16au(REF_6_1, CONST_256, TMP18);
44f54ceb 1277
bb270c08
DB
1278 vis_ld64_2(ref, stride_16, TMP4);
1279 ref += stride;
44f54ceb 1280
bb270c08
DB
1281 vis_ld64(dest[0], DST_0);
1282 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1283
bb270c08
DB
1284 vis_ld64_2(dest, 8, DST_2);
1285 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 1286
bb270c08
DB
1287 vis_ld64_2(ref, stride, TMP6);
1288 vis_pmerge(ZERO, REF_0, TMP0);
1289 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
44f54ceb 1290
bb270c08
DB
1291 vis_ld64_2(ref, stride_8, TMP8);
1292 vis_pmerge(ZERO, REF_4, TMP4);
44f54ceb 1293
bb270c08
DB
1294 vis_ld64_2(ref, stride_16, TMP10);
1295 ref += stride;
44f54ceb 1296
bb270c08
DB
1297 vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
1298 vis_faligndata(TMP6, TMP8, REF_2);
1299 vis_mul8x16au(REF_4_1, CONST_256, TMP6);
44f54ceb 1300
bb270c08
DB
1301 vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
1302 vis_faligndata(TMP8, TMP10, REF_6);
1303 vis_mul8x16al(DST_0, CONST_512, TMP20);
44f54ceb 1304
bb270c08
DB
1305 vis_padd16(TMP0, CONST_3, TMP0);
1306 vis_mul8x16al(DST_1, CONST_512, TMP22);
44f54ceb 1307
bb270c08
DB
1308 vis_padd16(TMP2, CONST_3, TMP2);
1309 vis_mul8x16al(DST_2, CONST_512, TMP24);
44f54ceb 1310
bb270c08
DB
1311 vis_padd16(TMP4, CONST_3, TMP4);
1312 vis_mul8x16al(DST_3, CONST_512, TMP26);
44f54ceb 1313
bb270c08 1314 vis_padd16(TMP6, CONST_3, TMP6);
44f54ceb 1315
bb270c08
DB
1316 vis_padd16(TMP12, TMP20, TMP12);
1317 vis_mul8x16al(REF_S0, CONST_512, TMP20);
44f54ceb 1318
bb270c08
DB
1319 vis_padd16(TMP14, TMP22, TMP14);
1320 vis_mul8x16al(REF_S0_1, CONST_512, TMP22);
44f54ceb 1321
bb270c08
DB
1322 vis_padd16(TMP16, TMP24, TMP16);
1323 vis_mul8x16al(REF_S2, CONST_512, TMP24);
44f54ceb 1324
bb270c08
DB
1325 vis_padd16(TMP18, TMP26, TMP18);
1326 vis_mul8x16al(REF_S2_1, CONST_512, TMP26);
44f54ceb 1327
bb270c08
DB
1328 vis_padd16(TMP12, TMP0, TMP12);
1329 vis_mul8x16au(REF_2, CONST_256, TMP28);
44f54ceb 1330
bb270c08
DB
1331 vis_padd16(TMP14, TMP2, TMP14);
1332 vis_mul8x16au(REF_2_1, CONST_256, TMP30);
44f54ceb 1333
bb270c08
DB
1334 vis_padd16(TMP16, TMP4, TMP16);
1335 vis_mul8x16au(REF_6, CONST_256, REF_S4);
44f54ceb 1336
bb270c08
DB
1337 vis_padd16(TMP18, TMP6, TMP18);
1338 vis_mul8x16au(REF_6_1, CONST_256, REF_S6);
44f54ceb 1339
bb270c08
DB
1340 vis_pack16(TMP12, DST_0);
1341 vis_padd16(TMP28, TMP0, TMP12);
44f54ceb 1342
bb270c08
DB
1343 vis_pack16(TMP14, DST_1);
1344 vis_st64(DST_0, dest[0]);
1345 vis_padd16(TMP30, TMP2, TMP14);
44f54ceb 1346
bb270c08
DB
1347 vis_pack16(TMP16, DST_2);
1348 vis_padd16(REF_S4, TMP4, TMP16);
44f54ceb 1349
bb270c08
DB
1350 vis_pack16(TMP18, DST_3);
1351 vis_st64_2(DST_2, dest, 8);
1352 dest += stride;
1353 vis_padd16(REF_S6, TMP6, TMP18);
44f54ceb 1354
bb270c08 1355 vis_padd16(TMP12, TMP20, TMP12);
44f54ceb 1356
bb270c08
DB
1357 vis_padd16(TMP14, TMP22, TMP14);
1358 vis_pack16(TMP12, DST_0);
44f54ceb 1359
bb270c08
DB
1360 vis_padd16(TMP16, TMP24, TMP16);
1361 vis_pack16(TMP14, DST_1);
1362 vis_st64(DST_0, dest[0]);
44f54ceb 1363
bb270c08
DB
1364 vis_padd16(TMP18, TMP26, TMP18);
1365 vis_pack16(TMP16, DST_2);
44f54ceb 1366
bb270c08
DB
1367 vis_pack16(TMP18, DST_3);
1368 vis_st64_2(DST_2, dest, 8);
1369 dest += stride;
1370 } while (--height);
44f54ceb
MN
1371}
1372
1373static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 1374 const int stride, int height)
44f54ceb 1375{
bb270c08
DB
1376 uint8_t *ref = (uint8_t *) _ref;
1377 int stride_8 = stride + 8;
44f54ceb 1378
bb270c08 1379 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 1380
bb270c08 1381 ref = vis_alignaddr(ref);
44f54ceb 1382
bb270c08
DB
1383 vis_ld64(ref[ 0], TMP0);
1384 vis_fzero(ZERO);
44f54ceb 1385
bb270c08 1386 vis_ld64(ref[ 8], TMP2);
44f54ceb 1387
bb270c08
DB
1388 vis_ld64(constants3[0], CONST_3);
1389 vis_faligndata(TMP0, TMP2, REF_2);
44f54ceb 1390
bb270c08 1391 vis_ld64(constants256_512[0], CONST_256);
44f54ceb 1392
bb270c08
DB
1393 height >>= 1;
1394 do { /* 20 cycles */
1395 vis_ld64_2(ref, stride, TMP0);
1396 vis_pmerge(ZERO, REF_2, TMP8);
1397 vis_mul8x16au(REF_2_1, CONST_256, TMP10);
44f54ceb 1398
bb270c08
DB
1399 vis_ld64_2(ref, stride_8, TMP2);
1400 ref += stride;
44f54ceb 1401
bb270c08 1402 vis_ld64(dest[0], DST_0);
44f54ceb 1403
bb270c08
DB
1404 vis_ld64_2(dest, stride, DST_2);
1405 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1406
bb270c08
DB
1407 vis_ld64_2(ref, stride, TMP4);
1408 vis_mul8x16al(DST_0, CONST_512, TMP16);
1409 vis_pmerge(ZERO, REF_0, TMP12);
44f54ceb 1410
bb270c08
DB
1411 vis_ld64_2(ref, stride_8, TMP6);
1412 ref += stride;
1413 vis_mul8x16al(DST_1, CONST_512, TMP18);
1414 vis_pmerge(ZERO, REF_0_1, TMP14);
44f54ceb 1415
bb270c08
DB
1416 vis_padd16(TMP12, CONST_3, TMP12);
1417 vis_mul8x16al(DST_2, CONST_512, TMP24);
44f54ceb 1418
bb270c08
DB
1419 vis_padd16(TMP14, CONST_3, TMP14);
1420 vis_mul8x16al(DST_3, CONST_512, TMP26);
44f54ceb 1421
bb270c08 1422 vis_faligndata(TMP4, TMP6, REF_2);
44f54ceb 1423
bb270c08 1424 vis_padd16(TMP8, TMP12, TMP8);
44f54ceb 1425
bb270c08
DB
1426 vis_padd16(TMP10, TMP14, TMP10);
1427 vis_mul8x16au(REF_2, CONST_256, TMP20);
44f54ceb 1428
bb270c08
DB
1429 vis_padd16(TMP8, TMP16, TMP0);
1430 vis_mul8x16au(REF_2_1, CONST_256, TMP22);
44f54ceb 1431
bb270c08
DB
1432 vis_padd16(TMP10, TMP18, TMP2);
1433 vis_pack16(TMP0, DST_0);
44f54ceb 1434
bb270c08
DB
1435 vis_pack16(TMP2, DST_1);
1436 vis_st64(DST_0, dest[0]);
1437 dest += stride;
1438 vis_padd16(TMP12, TMP20, TMP12);
44f54ceb 1439
bb270c08 1440 vis_padd16(TMP14, TMP22, TMP14);
44f54ceb 1441
bb270c08 1442 vis_padd16(TMP12, TMP24, TMP0);
44f54ceb 1443
bb270c08
DB
1444 vis_padd16(TMP14, TMP26, TMP2);
1445 vis_pack16(TMP0, DST_2);
44f54ceb 1446
bb270c08
DB
1447 vis_pack16(TMP2, DST_3);
1448 vis_st64(DST_2, dest[0]);
1449 dest += stride;
1450 } while (--height);
44f54ceb
MN
1451}
1452
1453static void MC_put_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 1454 const int stride, int height)
44f54ceb 1455{
bb270c08
DB
1456 uint8_t *ref = (uint8_t *) _ref;
1457 unsigned long off = (unsigned long) ref & 0x7;
1458 unsigned long off_plus_1 = off + 1;
1459 int stride_8 = stride + 8;
1460 int stride_16 = stride + 16;
44f54ceb 1461
bb270c08 1462 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 1463
bb270c08 1464 ref = vis_alignaddr(ref);
44f54ceb 1465
bb270c08
DB
1466 vis_ld64(ref[ 0], TMP0);
1467 vis_fzero(ZERO);
44f54ceb 1468
bb270c08 1469 vis_ld64(ref[ 8], TMP2);
44f54ceb 1470
bb270c08 1471 vis_ld64(ref[16], TMP4);
44f54ceb 1472
bb270c08
DB
1473 vis_ld64(constants2[0], CONST_2);
1474 vis_faligndata(TMP0, TMP2, REF_S0);
44f54ceb 1475
bb270c08
DB
1476 vis_ld64(constants256_512[0], CONST_256);
1477 vis_faligndata(TMP2, TMP4, REF_S4);
44f54ceb 1478
bb270c08
DB
1479 if (off != 0x7) {
1480 vis_alignaddr_g0((void *)off_plus_1);
1481 vis_faligndata(TMP0, TMP2, REF_S2);
1482 vis_faligndata(TMP2, TMP4, REF_S6);
1483 } else {
1484 vis_src1(TMP2, REF_S2);
1485 vis_src1(TMP4, REF_S6);
1486 }
44f54ceb 1487
bb270c08
DB
1488 height >>= 1;
1489 do {
1490 vis_ld64_2(ref, stride, TMP0);
1491 vis_mul8x16au(REF_S0, CONST_256, TMP12);
1492 vis_pmerge(ZERO, REF_S0_1, TMP14);
44f54ceb 1493
bb270c08 1494 vis_alignaddr_g0((void *)off);
44f54ceb 1495
bb270c08
DB
1496 vis_ld64_2(ref, stride_8, TMP2);
1497 vis_mul8x16au(REF_S2, CONST_256, TMP16);
1498 vis_pmerge(ZERO, REF_S2_1, TMP18);
44f54ceb 1499
bb270c08
DB
1500 vis_ld64_2(ref, stride_16, TMP4);
1501 ref += stride;
1502 vis_mul8x16au(REF_S4, CONST_256, TMP20);
1503 vis_pmerge(ZERO, REF_S4_1, TMP22);
44f54ceb 1504
bb270c08
DB
1505 vis_ld64_2(ref, stride, TMP6);
1506 vis_mul8x16au(REF_S6, CONST_256, TMP24);
1507 vis_pmerge(ZERO, REF_S6_1, TMP26);
44f54ceb 1508
bb270c08
DB
1509 vis_ld64_2(ref, stride_8, TMP8);
1510 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1511
bb270c08
DB
1512 vis_ld64_2(ref, stride_16, TMP10);
1513 ref += stride;
1514 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 1515
bb270c08 1516 vis_faligndata(TMP6, TMP8, REF_S0);
44f54ceb 1517
bb270c08 1518 vis_faligndata(TMP8, TMP10, REF_S4);
44f54ceb 1519
bb270c08
DB
1520 if (off != 0x7) {
1521 vis_alignaddr_g0((void *)off_plus_1);
1522 vis_faligndata(TMP0, TMP2, REF_2);
1523 vis_faligndata(TMP2, TMP4, REF_6);
1524 vis_faligndata(TMP6, TMP8, REF_S2);
1525 vis_faligndata(TMP8, TMP10, REF_S6);
1526 } else {
1527 vis_src1(TMP2, REF_2);
1528 vis_src1(TMP4, REF_6);
1529 vis_src1(TMP8, REF_S2);
1530 vis_src1(TMP10, REF_S6);
1531 }
44f54ceb 1532
bb270c08
DB
1533 vis_mul8x16au(REF_0, CONST_256, TMP0);
1534 vis_pmerge(ZERO, REF_0_1, TMP2);
44f54ceb 1535
bb270c08
DB
1536 vis_mul8x16au(REF_2, CONST_256, TMP4);
1537 vis_pmerge(ZERO, REF_2_1, TMP6);
44f54ceb 1538
bb270c08
DB
1539 vis_padd16(TMP0, CONST_2, TMP8);
1540 vis_mul8x16au(REF_4, CONST_256, TMP0);
44f54ceb 1541
bb270c08
DB
1542 vis_padd16(TMP2, CONST_2, TMP10);
1543 vis_mul8x16au(REF_4_1, CONST_256, TMP2);
44f54ceb 1544
bb270c08
DB
1545 vis_padd16(TMP8, TMP4, TMP8);
1546 vis_mul8x16au(REF_6, CONST_256, TMP4);
44f54ceb 1547
bb270c08
DB
1548 vis_padd16(TMP10, TMP6, TMP10);
1549 vis_mul8x16au(REF_6_1, CONST_256, TMP6);
44f54ceb 1550
bb270c08 1551 vis_padd16(TMP12, TMP8, TMP12);
44f54ceb 1552
bb270c08 1553 vis_padd16(TMP14, TMP10, TMP14);
44f54ceb 1554
bb270c08 1555 vis_padd16(TMP12, TMP16, TMP12);
44f54ceb 1556
bb270c08
DB
1557 vis_padd16(TMP14, TMP18, TMP14);
1558 vis_pack16(TMP12, DST_0);
44f54ceb 1559
bb270c08
DB
1560 vis_pack16(TMP14, DST_1);
1561 vis_st64(DST_0, dest[0]);
1562 vis_padd16(TMP0, CONST_2, TMP12);
44f54ceb 1563
bb270c08
DB
1564 vis_mul8x16au(REF_S0, CONST_256, TMP0);
1565 vis_padd16(TMP2, CONST_2, TMP14);
44f54ceb 1566
bb270c08
DB
1567 vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
1568 vis_padd16(TMP12, TMP4, TMP12);
44f54ceb 1569
bb270c08
DB
1570 vis_mul8x16au(REF_S2, CONST_256, TMP4);
1571 vis_padd16(TMP14, TMP6, TMP14);
44f54ceb 1572
bb270c08
DB
1573 vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
1574 vis_padd16(TMP20, TMP12, TMP20);
44f54ceb 1575
bb270c08 1576 vis_padd16(TMP22, TMP14, TMP22);
44f54ceb 1577
bb270c08 1578 vis_padd16(TMP20, TMP24, TMP20);
44f54ceb 1579
bb270c08
DB
1580 vis_padd16(TMP22, TMP26, TMP22);
1581 vis_pack16(TMP20, DST_2);
44f54ceb 1582
bb270c08
DB
1583 vis_pack16(TMP22, DST_3);
1584 vis_st64_2(DST_2, dest, 8);
1585 dest += stride;
1586 vis_padd16(TMP0, TMP4, TMP24);
44f54ceb 1587
bb270c08
DB
1588 vis_mul8x16au(REF_S4, CONST_256, TMP0);
1589 vis_padd16(TMP2, TMP6, TMP26);
44f54ceb 1590
bb270c08
DB
1591 vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
1592 vis_padd16(TMP24, TMP8, TMP24);
44f54ceb 1593
bb270c08
DB
1594 vis_padd16(TMP26, TMP10, TMP26);
1595 vis_pack16(TMP24, DST_0);
44f54ceb 1596
bb270c08
DB
1597 vis_pack16(TMP26, DST_1);
1598 vis_st64(DST_0, dest[0]);
1599 vis_pmerge(ZERO, REF_S6, TMP4);
44f54ceb 1600
bb270c08 1601 vis_pmerge(ZERO, REF_S6_1, TMP6);
44f54ceb 1602
bb270c08 1603 vis_padd16(TMP0, TMP4, TMP0);
44f54ceb 1604
bb270c08 1605 vis_padd16(TMP2, TMP6, TMP2);
44f54ceb 1606
bb270c08 1607 vis_padd16(TMP0, TMP12, TMP0);
44f54ceb 1608
bb270c08
DB
1609 vis_padd16(TMP2, TMP14, TMP2);
1610 vis_pack16(TMP0, DST_2);
44f54ceb 1611
bb270c08
DB
1612 vis_pack16(TMP2, DST_3);
1613 vis_st64_2(DST_2, dest, 8);
1614 dest += stride;
1615 } while (--height);
44f54ceb
MN
1616}
1617
1618static void MC_put_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 1619 const int stride, int height)
44f54ceb 1620{
bb270c08
DB
1621 uint8_t *ref = (uint8_t *) _ref;
1622 unsigned long off = (unsigned long) ref & 0x7;
1623 unsigned long off_plus_1 = off + 1;
1624 int stride_8 = stride + 8;
44f54ceb 1625
bb270c08 1626 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 1627
bb270c08 1628 ref = vis_alignaddr(ref);
44f54ceb 1629
bb270c08
DB
1630 vis_ld64(ref[ 0], TMP0);
1631 vis_fzero(ZERO);
44f54ceb 1632
bb270c08 1633 vis_ld64(ref[ 8], TMP2);
44f54ceb 1634
bb270c08 1635 vis_ld64(constants2[0], CONST_2);
44f54ceb 1636
bb270c08
DB
1637 vis_ld64(constants256_512[0], CONST_256);
1638 vis_faligndata(TMP0, TMP2, REF_S0);
44f54ceb 1639
bb270c08
DB
1640 if (off != 0x7) {
1641 vis_alignaddr_g0((void *)off_plus_1);
1642 vis_faligndata(TMP0, TMP2, REF_S2);
1643 } else {
1644 vis_src1(TMP2, REF_S2);
1645 }
44f54ceb 1646
bb270c08
DB
1647 height >>= 1;
1648 do { /* 26 cycles */
1649 vis_ld64_2(ref, stride, TMP0);
1650 vis_mul8x16au(REF_S0, CONST_256, TMP8);
1651 vis_pmerge(ZERO, REF_S2, TMP12);
44f54ceb 1652
bb270c08 1653 vis_alignaddr_g0((void *)off);
44f54ceb 1654
bb270c08
DB
1655 vis_ld64_2(ref, stride_8, TMP2);
1656 ref += stride;
1657 vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
1658 vis_pmerge(ZERO, REF_S2_1, TMP14);
44f54ceb 1659
bb270c08 1660 vis_ld64_2(ref, stride, TMP4);
44f54ceb 1661
bb270c08
DB
1662 vis_ld64_2(ref, stride_8, TMP6);
1663 ref += stride;
1664 vis_faligndata(TMP0, TMP2, REF_S4);
44f54ceb 1665
bb270c08 1666 vis_pmerge(ZERO, REF_S4, TMP18);
44f54ceb 1667
bb270c08 1668 vis_pmerge(ZERO, REF_S4_1, TMP20);
44f54ceb 1669
bb270c08 1670 vis_faligndata(TMP4, TMP6, REF_S0);
44f54ceb 1671
bb270c08
DB
1672 if (off != 0x7) {
1673 vis_alignaddr_g0((void *)off_plus_1);
1674 vis_faligndata(TMP0, TMP2, REF_S6);
1675 vis_faligndata(TMP4, TMP6, REF_S2);
1676 } else {
1677 vis_src1(TMP2, REF_S6);
1678 vis_src1(TMP6, REF_S2);
1679 }
44f54ceb 1680
bb270c08
DB
1681 vis_padd16(TMP18, CONST_2, TMP18);
1682 vis_mul8x16au(REF_S6, CONST_256, TMP22);
44f54ceb 1683
bb270c08
DB
1684 vis_padd16(TMP20, CONST_2, TMP20);
1685 vis_mul8x16au(REF_S6_1, CONST_256, TMP24);
44f54ceb 1686
bb270c08
DB
1687 vis_mul8x16au(REF_S0, CONST_256, TMP26);
1688 vis_pmerge(ZERO, REF_S0_1, TMP28);
44f54ceb 1689
bb270c08
DB
1690 vis_mul8x16au(REF_S2, CONST_256, TMP30);
1691 vis_padd16(TMP18, TMP22, TMP18);
44f54ceb 1692
bb270c08
DB
1693 vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
1694 vis_padd16(TMP20, TMP24, TMP20);
44f54ceb 1695
bb270c08 1696 vis_padd16(TMP8, TMP18, TMP8);
44f54ceb 1697
bb270c08 1698 vis_padd16(TMP10, TMP20, TMP10);
44f54ceb 1699
bb270c08 1700 vis_padd16(TMP8, TMP12, TMP8);
44f54ceb 1701
bb270c08
DB
1702 vis_padd16(TMP10, TMP14, TMP10);
1703 vis_pack16(TMP8, DST_0);
44f54ceb 1704
bb270c08
DB
1705 vis_pack16(TMP10, DST_1);
1706 vis_st64(DST_0, dest[0]);
1707 dest += stride;
1708 vis_padd16(TMP18, TMP26, TMP18);
44f54ceb 1709
bb270c08 1710 vis_padd16(TMP20, TMP28, TMP20);
44f54ceb 1711
bb270c08 1712 vis_padd16(TMP18, TMP30, TMP18);
44f54ceb 1713
bb270c08
DB
1714 vis_padd16(TMP20, TMP32, TMP20);
1715 vis_pack16(TMP18, DST_2);
44f54ceb 1716
bb270c08
DB
1717 vis_pack16(TMP20, DST_3);
1718 vis_st64(DST_2, dest[0]);
1719 dest += stride;
1720 } while (--height);
44f54ceb
MN
1721}
1722
1723static void MC_avg_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 1724 const int stride, int height)
44f54ceb 1725{
bb270c08
DB
1726 uint8_t *ref = (uint8_t *) _ref;
1727 unsigned long off = (unsigned long) ref & 0x7;
1728 unsigned long off_plus_1 = off + 1;
1729 int stride_8 = stride + 8;
1730 int stride_16 = stride + 16;
44f54ceb 1731
bb270c08 1732 vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 1733
bb270c08 1734 ref = vis_alignaddr(ref);
44f54ceb 1735
bb270c08
DB
1736 vis_ld64(ref[ 0], TMP0);
1737 vis_fzero(ZERO);
44f54ceb 1738
bb270c08 1739 vis_ld64(ref[ 8], TMP2);
44f54ceb 1740
bb270c08 1741 vis_ld64(ref[16], TMP4);
44f54ceb 1742
bb270c08
DB
1743 vis_ld64(constants6[0], CONST_6);
1744 vis_faligndata(TMP0, TMP2, REF_S0);
44f54ceb 1745
bb270c08
DB
1746 vis_ld64(constants256_1024[0], CONST_256);
1747 vis_faligndata(TMP2, TMP4, REF_S4);
44f54ceb 1748
bb270c08
DB
1749 if (off != 0x7) {
1750 vis_alignaddr_g0((void *)off_plus_1);
1751 vis_faligndata(TMP0, TMP2, REF_S2);
1752 vis_faligndata(TMP2, TMP4, REF_S6);
1753 } else {
1754 vis_src1(TMP2, REF_S2);
1755 vis_src1(TMP4, REF_S6);
1756 }
44f54ceb 1757
bb270c08
DB
1758 height >>= 1;
1759 do { /* 55 cycles */
1760 vis_ld64_2(ref, stride, TMP0);
1761 vis_mul8x16au(REF_S0, CONST_256, TMP12);
1762 vis_pmerge(ZERO, REF_S0_1, TMP14);
44f54ceb 1763
bb270c08 1764 vis_alignaddr_g0((void *)off);
44f54ceb 1765
bb270c08
DB
1766 vis_ld64_2(ref, stride_8, TMP2);
1767 vis_mul8x16au(REF_S2, CONST_256, TMP16);
1768 vis_pmerge(ZERO, REF_S2_1, TMP18);
44f54ceb 1769
bb270c08
DB
1770 vis_ld64_2(ref, stride_16, TMP4);
1771 ref += stride;
1772 vis_mul8x16au(REF_S4, CONST_256, TMP20);
1773 vis_pmerge(ZERO, REF_S4_1, TMP22);
44f54ceb 1774
bb270c08
DB
1775 vis_ld64_2(ref, stride, TMP6);
1776 vis_mul8x16au(REF_S6, CONST_256, TMP24);
1777 vis_pmerge(ZERO, REF_S6_1, TMP26);
44f54ceb 1778
bb270c08
DB
1779 vis_ld64_2(ref, stride_8, TMP8);
1780 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 1781
bb270c08
DB
1782 vis_ld64_2(ref, stride_16, TMP10);
1783 ref += stride;
1784 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 1785
bb270c08
DB
1786 vis_ld64(dest[0], DST_0);
1787 vis_faligndata(TMP6, TMP8, REF_S0);
44f54ceb 1788
bb270c08
DB
1789 vis_ld64_2(dest, 8, DST_2);
1790 vis_faligndata(TMP8, TMP10, REF_S4);
44f54ceb 1791
bb270c08
DB
1792 if (off != 0x7) {
1793 vis_alignaddr_g0((void *)off_plus_1);
1794 vis_faligndata(TMP0, TMP2, REF_2);
1795 vis_faligndata(TMP2, TMP4, REF_6);
1796 vis_faligndata(TMP6, TMP8, REF_S2);
1797 vis_faligndata(TMP8, TMP10, REF_S6);
1798 } else {
1799 vis_src1(TMP2, REF_2);
1800 vis_src1(TMP4, REF_6);
1801 vis_src1(TMP8, REF_S2);
1802 vis_src1(TMP10, REF_S6);
1803 }
44f54ceb 1804
bb270c08
DB
1805 vis_mul8x16al(DST_0, CONST_1024, TMP30);
1806 vis_pmerge(ZERO, REF_0, TMP0);
44f54ceb 1807
bb270c08
DB
1808 vis_mul8x16al(DST_1, CONST_1024, TMP32);
1809 vis_pmerge(ZERO, REF_0_1, TMP2);
44f54ceb 1810
bb270c08
DB
1811 vis_mul8x16au(REF_2, CONST_256, TMP4);
1812 vis_pmerge(ZERO, REF_2_1, TMP6);
44f54ceb 1813
bb270c08
DB
1814 vis_mul8x16al(DST_2, CONST_1024, REF_0);
1815 vis_padd16(TMP0, CONST_6, TMP0);
44f54ceb 1816
bb270c08
DB
1817 vis_mul8x16al(DST_3, CONST_1024, REF_2);
1818 vis_padd16(TMP2, CONST_6, TMP2);
44f54ceb 1819
bb270c08
DB
1820 vis_padd16(TMP0, TMP4, TMP0);
1821 vis_mul8x16au(REF_4, CONST_256, TMP4);
44f54ceb 1822
bb270c08
DB
1823 vis_padd16(TMP2, TMP6, TMP2);
1824 vis_mul8x16au(REF_4_1, CONST_256, TMP6);
44f54ceb 1825
bb270c08
DB
1826 vis_padd16(TMP12, TMP0, TMP12);
1827 vis_mul8x16au(REF_6, CONST_256, TMP8);
44f54ceb 1828
bb270c08
DB
1829 vis_padd16(TMP14, TMP2, TMP14);
1830 vis_mul8x16au(REF_6_1, CONST_256, TMP10);
44f54ceb 1831
bb270c08
DB
1832 vis_padd16(TMP12, TMP16, TMP12);
1833 vis_mul8x16au(REF_S0, CONST_256, REF_4);
44f54ceb 1834
bb270c08
DB
1835 vis_padd16(TMP14, TMP18, TMP14);
1836 vis_mul8x16au(REF_S0_1, CONST_256, REF_6);
44f54ceb 1837
bb270c08 1838 vis_padd16(TMP12, TMP30, TMP12);
44f54ceb 1839
bb270c08
DB
1840 vis_padd16(TMP14, TMP32, TMP14);
1841 vis_pack16(TMP12, DST_0);
44f54ceb 1842
bb270c08
DB
1843 vis_pack16(TMP14, DST_1);
1844 vis_st64(DST_0, dest[0]);
1845 vis_padd16(TMP4, CONST_6, TMP4);
44f54ceb 1846
bb270c08
DB
1847 vis_ld64_2(dest, stride, DST_0);
1848 vis_padd16(TMP6, CONST_6, TMP6);
1849 vis_mul8x16au(REF_S2, CONST_256, TMP12);
44f54ceb 1850
bb270c08
DB
1851 vis_padd16(TMP4, TMP8, TMP4);
1852 vis_mul8x16au(REF_S2_1, CONST_256, TMP14);
44f54ceb 1853
bb270c08 1854 vis_padd16(TMP6, TMP10, TMP6);
44f54ceb 1855
bb270c08 1856 vis_padd16(TMP20, TMP4, TMP20);
44f54ceb 1857
bb270c08 1858 vis_padd16(TMP22, TMP6, TMP22);
44f54ceb 1859
bb270c08 1860 vis_padd16(TMP20, TMP24, TMP20);
44f54ceb 1861
bb270c08 1862 vis_padd16(TMP22, TMP26, TMP22);
44f54ceb 1863
bb270c08
DB
1864 vis_padd16(TMP20, REF_0, TMP20);
1865 vis_mul8x16au(REF_S4, CONST_256, REF_0);
44f54ceb 1866
bb270c08
DB
1867 vis_padd16(TMP22, REF_2, TMP22);
1868 vis_pack16(TMP20, DST_2);
44f54ceb 1869
bb270c08
DB
1870 vis_pack16(TMP22, DST_3);
1871 vis_st64_2(DST_2, dest, 8);
1872 dest += stride;
44f54ceb 1873
bb270c08
DB
1874 vis_ld64_2(dest, 8, DST_2);
1875 vis_mul8x16al(DST_0, CONST_1024, TMP30);
1876 vis_pmerge(ZERO, REF_S4_1, REF_2);
44f54ceb 1877
bb270c08
DB
1878 vis_mul8x16al(DST_1, CONST_1024, TMP32);
1879 vis_padd16(REF_4, TMP0, TMP8);
44f54ceb 1880
bb270c08
DB
1881 vis_mul8x16au(REF_S6, CONST_256, REF_4);
1882 vis_padd16(REF_6, TMP2, TMP10);
44f54ceb 1883
bb270c08
DB
1884 vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
1885 vis_padd16(TMP8, TMP12, TMP8);
44f54ceb 1886
bb270c08 1887 vis_padd16(TMP10, TMP14, TMP10);
44f54ceb 1888
bb270c08 1889 vis_padd16(TMP8, TMP30, TMP8);
44f54ceb 1890
bb270c08
DB
1891 vis_padd16(TMP10, TMP32, TMP10);
1892 vis_pack16(TMP8, DST_0);
44f54ceb 1893
bb270c08
DB
1894 vis_pack16(TMP10, DST_1);
1895 vis_st64(DST_0, dest[0]);
44f54ceb 1896
bb270c08 1897 vis_padd16(REF_0, TMP4, REF_0);
44f54ceb 1898
bb270c08
DB
1899 vis_mul8x16al(DST_2, CONST_1024, TMP30);
1900 vis_padd16(REF_2, TMP6, REF_2);
44f54ceb 1901
bb270c08
DB
1902 vis_mul8x16al(DST_3, CONST_1024, TMP32);
1903 vis_padd16(REF_0, REF_4, REF_0);
44f54ceb 1904
bb270c08 1905 vis_padd16(REF_2, REF_6, REF_2);
44f54ceb 1906
bb270c08 1907 vis_padd16(REF_0, TMP30, REF_0);
44f54ceb 1908
bb270c08 1909 /* stall */
44f54ceb 1910
bb270c08
DB
1911 vis_padd16(REF_2, TMP32, REF_2);
1912 vis_pack16(REF_0, DST_2);
44f54ceb 1913
bb270c08
DB
1914 vis_pack16(REF_2, DST_3);
1915 vis_st64_2(DST_2, dest, 8);
1916 dest += stride;
1917 } while (--height);
44f54ceb
MN
1918}
1919
1920static void MC_avg_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 1921 const int stride, int height)
44f54ceb 1922{
bb270c08
DB
1923 uint8_t *ref = (uint8_t *) _ref;
1924 unsigned long off = (unsigned long) ref & 0x7;
1925 unsigned long off_plus_1 = off + 1;
1926 int stride_8 = stride + 8;
44f54ceb 1927
bb270c08 1928 vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 1929
bb270c08 1930 ref = vis_alignaddr(ref);
44f54ceb 1931
bb270c08
DB
1932 vis_ld64(ref[0], TMP0);
1933 vis_fzero(ZERO);
44f54ceb 1934
bb270c08 1935 vis_ld64_2(ref, 8, TMP2);
44f54ceb 1936
bb270c08 1937 vis_ld64(constants6[0], CONST_6);
44f54ceb 1938
bb270c08
DB
1939 vis_ld64(constants256_1024[0], CONST_256);
1940 vis_faligndata(TMP0, TMP2, REF_S0);
44f54ceb 1941
bb270c08
DB
1942 if (off != 0x7) {
1943 vis_alignaddr_g0((void *)off_plus_1);
1944 vis_faligndata(TMP0, TMP2, REF_S2);
1945 } else {
1946 vis_src1(TMP2, REF_S2);
1947 }
44f54ceb 1948
bb270c08
DB
1949 height >>= 1;
1950 do { /* 31 cycles */
1951 vis_ld64_2(ref, stride, TMP0);
1952 vis_mul8x16au(REF_S0, CONST_256, TMP8);
1953 vis_pmerge(ZERO, REF_S0_1, TMP10);
44f54ceb 1954
bb270c08
DB
1955 vis_ld64_2(ref, stride_8, TMP2);
1956 ref += stride;
1957 vis_mul8x16au(REF_S2, CONST_256, TMP12);
1958 vis_pmerge(ZERO, REF_S2_1, TMP14);
44f54ceb 1959
bb270c08 1960 vis_alignaddr_g0((void *)off);
44f54ceb 1961
bb270c08
DB
1962 vis_ld64_2(ref, stride, TMP4);
1963 vis_faligndata(TMP0, TMP2, REF_S4);
44f54ceb 1964
bb270c08
DB
1965 vis_ld64_2(ref, stride_8, TMP6);
1966 ref += stride;
44f54ceb 1967
bb270c08
DB
1968 vis_ld64(dest[0], DST_0);
1969 vis_faligndata(TMP4, TMP6, REF_S0);
44f54ceb 1970
bb270c08 1971 vis_ld64_2(dest, stride, DST_2);
44f54ceb 1972
bb270c08
DB
1973 if (off != 0x7) {
1974 vis_alignaddr_g0((void *)off_plus_1);
1975 vis_faligndata(TMP0, TMP2, REF_S6);
1976 vis_faligndata(TMP4, TMP6, REF_S2);
1977 } else {
1978 vis_src1(TMP2, REF_S6);
1979 vis_src1(TMP6, REF_S2);
1980 }
44f54ceb 1981
bb270c08
DB
1982 vis_mul8x16al(DST_0, CONST_1024, TMP30);
1983 vis_pmerge(ZERO, REF_S4, TMP22);
44f54ceb 1984
bb270c08
DB
1985 vis_mul8x16al(DST_1, CONST_1024, TMP32);
1986 vis_pmerge(ZERO, REF_S4_1, TMP24);
44f54ceb 1987
bb270c08
DB
1988 vis_mul8x16au(REF_S6, CONST_256, TMP26);
1989 vis_pmerge(ZERO, REF_S6_1, TMP28);
44f54ceb 1990
bb270c08
DB
1991 vis_mul8x16au(REF_S0, CONST_256, REF_S4);
1992 vis_padd16(TMP22, CONST_6, TMP22);
44f54ceb 1993
bb270c08
DB
1994 vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
1995 vis_padd16(TMP24, CONST_6, TMP24);
44f54ceb 1996
bb270c08
DB
1997 vis_mul8x16al(DST_2, CONST_1024, REF_0);
1998 vis_padd16(TMP22, TMP26, TMP22);
44f54ceb 1999
bb270c08
DB
2000 vis_mul8x16al(DST_3, CONST_1024, REF_2);
2001 vis_padd16(TMP24, TMP28, TMP24);
44f54ceb 2002
bb270c08
DB
2003 vis_mul8x16au(REF_S2, CONST_256, TMP26);
2004 vis_padd16(TMP8, TMP22, TMP8);
44f54ceb 2005
bb270c08
DB
2006 vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
2007 vis_padd16(TMP10, TMP24, TMP10);
44f54ceb 2008
bb270c08 2009 vis_padd16(TMP8, TMP12, TMP8);
44f54ceb 2010
bb270c08 2011 vis_padd16(TMP10, TMP14, TMP10);
44f54ceb 2012
bb270c08 2013 vis_padd16(TMP8, TMP30, TMP8);
44f54ceb 2014
bb270c08
DB
2015 vis_padd16(TMP10, TMP32, TMP10);
2016 vis_pack16(TMP8, DST_0);
44f54ceb 2017
bb270c08
DB
2018 vis_pack16(TMP10, DST_1);
2019 vis_st64(DST_0, dest[0]);
2020 dest += stride;
44f54ceb 2021
bb270c08 2022 vis_padd16(REF_S4, TMP22, TMP12);
44f54ceb 2023
bb270c08 2024 vis_padd16(REF_S6, TMP24, TMP14);
44f54ceb 2025
bb270c08 2026 vis_padd16(TMP12, TMP26, TMP12);
44f54ceb 2027
bb270c08 2028 vis_padd16(TMP14, TMP28, TMP14);
44f54ceb 2029
bb270c08 2030 vis_padd16(TMP12, REF_0, TMP12);
44f54ceb 2031
bb270c08
DB
2032 vis_padd16(TMP14, REF_2, TMP14);
2033 vis_pack16(TMP12, DST_2);
44f54ceb 2034
bb270c08
DB
2035 vis_pack16(TMP14, DST_3);
2036 vis_st64(DST_2, dest[0]);
2037 dest += stride;
2038 } while (--height);
44f54ceb
MN
2039}
2040
2041/* End of rounding code */
2042
2043/* Start of no rounding code */
2044/* The trick used in some of this file is the formula from the MMX
2045 * motion comp code, which is:
2046 *
2047 * (x+y)>>1 == (x&y)+((x^y)>>1)
2048 *
2049 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
2050 * We avoid overflows by masking before we do the shift, and we
2051 * implement the shift by multiplying by 1/2 using mul8x16. So in
2052 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
2053 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
2054 * the value 0x80808080 is in f8):
2055 *
bb270c08
DB
2056 * fxor f0, f2, f10
2057 * fand f10, f4, f10
2058 * fmul8x16 f8, f10, f10
2059 * fand f10, f6, f10
2060 * fand f0, f2, f12
2061 * fpadd16 f12, f10, f10
44f54ceb
MN
2062 */
2063
/* Copy a 16-byte-wide block of `height` rows from _ref to dest, no
 * interpolation ("o" variant).  Each row: load up to 24 bytes around the
 * possibly unaligned source, realign with faligndata, store two aligned
 * 8-byte words.  dest is assumed 8-byte aligned (vis_st64); height must
 * be >= 1 for the do/while. */
static void MC_put_no_round_o_16_vis (uint8_t * dest, const uint8_t * _ref,
                                      const int stride, int height)
{
    /* NOTE(review): cast drops const — vis_alignaddr() presumably takes a
     * non-const pointer; the data itself is never written. */
    uint8_t *ref = (uint8_t *) _ref;

    /* Set GSR alignment from the low ref bits; ref is rounded down to 8. */
    ref = vis_alignaddr(ref);
    do {    /* 5 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;

        /* Extract the 16 wanted bytes from the 24 loaded ones. */
        vis_faligndata(TMP0, TMP2, REF_0);
        vis_st64(REF_0, dest[0]);

        vis_faligndata(TMP2, TMP4, REF_2);
        vis_st64_2(REF_2, dest, 8);
        dest += stride;
    } while (--height);
}
2086
/* Copy an 8-byte-wide block of `height` rows from _ref to dest, no
 * interpolation.  Same realignment scheme as the 16-wide variant but with
 * a single 8-byte store per row.  height must be >= 1. */
static void MC_put_no_round_o_8_vis (uint8_t * dest, const uint8_t * _ref,
                                     const int stride, int height)
{
    /* NOTE(review): cast drops const — data is only read. */
    uint8_t *ref = (uint8_t *) _ref;

    ref = vis_alignaddr(ref);
    do {    /* 4 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);
        ref += stride;

        /* stall */

        vis_faligndata(TMP0, TMP2, REF_0);
        vis_st64(REF_0, dest[0]);
        dest += stride;
    } while (--height);
}
2106
2107
/* Average a 16-byte-wide reference block into dest without rounding:
 * dest = (dest + ref) >> 1, computed per byte as (x&y) + ((x^y)>>1)
 * (see the formula comment above).  The >>1 of the xor term is done by
 * masking with 0xfe, multiplying by 0x80 via mul8x16 and masking with
 * 0x7f.  The loop is software-pipelined and processes two rows per
 * iteration, with one final two-row epilogue after the loop; height is
 * assumed even and >= 4 (callers pass 8/16 — height == 2 would make the
 * do/while counter go negative; TODO confirm callers). */
static void MC_avg_no_round_o_16_vis (uint8_t * dest, const uint8_t * _ref,
                                      const int stride, int height)
{
    /* NOTE(review): cast drops const — reference data is only read. */
    uint8_t *ref = (uint8_t *) _ref;
    int stride_8 = stride + 8;          /* offset of dest's second qword, next row */

    ref = vis_alignaddr(ref);

    /* Prologue: prime REF_0/REF_2 (first ref row) and DST_0/DST_2
     * (first dest row), and load the bit-trick constants. */
    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(dest[0], DST_0);

    vis_ld64(dest[8], DST_2);

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP2, TMP4, REF_2);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;         /* two rows per iteration + epilogue */

    do {    /* 24 cycles */
        /* Row A: combine current REF/DST while loading row B's source. */
        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP6);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP6, MASK_fe, TMP6);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;
        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_xor(DST_2, REF_2, TMP8);

        vis_and(TMP8, MASK_fe, TMP8);

        vis_and(DST_0, REF_0, TMP10);
        vis_ld64_2(dest, stride, DST_0);    /* prefetch next dest row */
        vis_mul8x16(CONST_128, TMP8, TMP8);

        vis_and(DST_2, REF_2, TMP12);
        vis_ld64_2(dest, stride_8, DST_2);

        vis_ld64(ref[0], TMP14);
        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_padd16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_padd16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);

        /* Row B: same dance, using the data loaded during row A. */
        dest += stride;
        vis_ld64_2(ref, 8, TMP16);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, 16, TMP18);
        vis_faligndata(TMP2, TMP4, REF_2);
        ref += stride;

        vis_xor(DST_0, REF_0, TMP20);

        vis_and(TMP20, MASK_fe, TMP20);

        vis_xor(DST_2, REF_2, TMP22);
        vis_mul8x16(CONST_128, TMP20, TMP20);

        vis_and(TMP22, MASK_fe, TMP22);

        vis_and(DST_0, REF_0, TMP24);
        vis_mul8x16(CONST_128, TMP22, TMP22);

        vis_and(DST_2, REF_2, TMP26);

        vis_ld64_2(dest, stride, DST_0);
        vis_faligndata(TMP14, TMP16, REF_0);

        vis_ld64_2(dest, stride_8, DST_2);
        vis_faligndata(TMP16, TMP18, REF_2);

        vis_and(TMP20, MASK_7f, TMP20);

        vis_and(TMP22, MASK_7f, TMP22);

        vis_padd16(TMP24, TMP20, TMP20);
        vis_st64(TMP20, dest[0]);

        vis_padd16(TMP26, TMP22, TMP22);
        vis_st64_2(TMP22, dest, 8);
        dest += stride;
    } while (--height);

    /* Epilogue: the last two rows, with no further ref prefetching. */
    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_ld64_2(ref, 16, TMP4);
    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_xor(DST_2, REF_2, TMP8);

    vis_and(TMP8, MASK_fe, TMP8);

    vis_and(DST_0, REF_0, TMP10);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP8, TMP8);

    vis_and(DST_2, REF_2, TMP12);
    vis_ld64_2(dest, stride_8, DST_2);

    vis_ld64(ref[0], TMP14);
    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_padd16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_padd16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);

    dest += stride;
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_2);

    vis_xor(DST_0, REF_0, TMP20);

    vis_and(TMP20, MASK_fe, TMP20);

    vis_xor(DST_2, REF_2, TMP22);
    vis_mul8x16(CONST_128, TMP20, TMP20);

    vis_and(TMP22, MASK_fe, TMP22);

    vis_and(DST_0, REF_0, TMP24);
    vis_mul8x16(CONST_128, TMP22, TMP22);

    vis_and(DST_2, REF_2, TMP26);

    vis_and(TMP20, MASK_7f, TMP20);

    vis_and(TMP22, MASK_7f, TMP22);

    vis_padd16(TMP24, TMP20, TMP20);
    vis_st64(TMP20, dest[0]);

    vis_padd16(TMP26, TMP22, TMP22);
    vis_st64_2(TMP22, dest, 8);
}
2268
/* Average an 8-byte-wide reference block into dest without rounding:
 * dest = (dest + ref) >> 1 per byte, via (x&y) + ((x^y)>>1) using the
 * MASK_fe / CONST_128 / MASK_7f trick documented above.  Two rows per
 * loop iteration plus a two-row epilogue; height is assumed even and
 * >= 4 (callers pass 8/16 — TODO confirm). */
static void MC_avg_no_round_o_8_vis (uint8_t * dest, const uint8_t * _ref,
                                     const int stride, int height)
{
    /* NOTE(review): cast drops const — reference data is only read. */
    uint8_t *ref = (uint8_t *) _ref;

    ref = vis_alignaddr(ref);

    /* Prologue: first aligned ref row into REF_0, first dest row into
     * DST_0, and the averaging constants. */
    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(dest[0], DST_0);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;         /* two rows per iteration + epilogue */

    do {    /* 12 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP4);

        vis_ld64(ref[8], TMP2);
        vis_and(TMP4, MASK_fe, TMP4);

        vis_and(DST_0, REF_0, TMP6);
        vis_ld64_2(dest, stride, DST_0);    /* prefetch next dest row */
        ref += stride;
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_ld64(ref[0], TMP12);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(ref[8], TMP2);
        vis_xor(DST_0, REF_0, TMP0);
        ref += stride;

        vis_and(TMP0, MASK_fe, TMP0);

        vis_and(TMP4, MASK_7f, TMP4);

        vis_padd16(TMP6, TMP4, TMP4);
        vis_st64(TMP4, dest[0]);
        dest += stride;
        vis_mul8x16(CONST_128, TMP0, TMP0);

        vis_and(DST_0, REF_0, TMP6);
        vis_ld64_2(dest, stride, DST_0);

        vis_faligndata(TMP12, TMP2, REF_0);

        vis_and(TMP0, MASK_7f, TMP0);

        vis_padd16(TMP6, TMP0, TMP4);
        vis_st64(TMP4, dest[0]);
        dest += stride;
    } while (--height);

    /* Epilogue: last two rows, no further ref prefetch. */
    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP4);

    vis_ld64(ref[8], TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_and(DST_0, REF_0, TMP6);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_xor(DST_0, REF_0, TMP0);

    vis_and(TMP0, MASK_fe, TMP0);

    vis_and(TMP4, MASK_7f, TMP4);

    vis_padd16(TMP6, TMP4, TMP4);
    vis_st64(TMP4, dest[0]);
    dest += stride;
    vis_mul8x16(CONST_128, TMP0, TMP0);

    vis_and(DST_0, REF_0, TMP6);

    vis_and(TMP0, MASK_7f, TMP0);

    vis_padd16(TMP6, TMP0, TMP4);
    vis_st64(TMP4, dest[0]);
}
2362
/* Horizontal half-pel copy, 16 bytes wide, no rounding:
 * dest = (ref + ref+1) >> 1 per byte, via (x&y) + ((x^y)>>1).
 * The two horizontally adjacent source vectors are produced by running
 * faligndata twice over the same loaded data: once with alignment `off`
 * (REF_0/REF_4) and once with `off + 1` (REF_2/REF_6).  When off == 7
 * the +1 alignment wraps into the next qword, so vis_src1 is used
 * instead — presumably a shift-by-one helper for that edge case (TODO
 * confirm against vis.h).  Two rows per iteration plus a two-row
 * epilogue; height assumed even and >= 4 (callers pass 8/16). */
static void MC_put_no_round_x_16_vis (uint8_t * dest, const uint8_t * _ref,
                                      const int stride, int height)
{
    /* NOTE(review): cast drops const — data is only read. */
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;   /* misalignment of ref */
    unsigned long off_plus_1 = off + 1;              /* alignment for ref+1 */

    ref = vis_alignaddr(ref);

    /* Prologue: build REF_0/REF_4 (ref) and REF_2/REF_6 (ref+1) for the
     * first row, and load the averaging constants. */
    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, 8, TMP2);

    vis_ld64_2(ref, 16, TMP4);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants128[0], CONST_128);
    vis_faligndata(TMP2, TMP4, REF_4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
        vis_faligndata(TMP2, TMP4, REF_6);
    } else {
        vis_src1(TMP2, REF_2);
        vis_src1(TMP4, REF_6);
    }

    ref += stride;
    height = (height >> 1) - 1;         /* two rows per iteration + epilogue */

    do {    /* 34 cycles */
        /* Row A: average the prepared REF pairs while loading rows A+1
         * and A+2 from memory. */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP6);

        vis_ld64_2(ref, 8, TMP2);
        vis_xor(REF_4, REF_6, TMP8);

        vis_ld64_2(ref, 16, TMP4);
        vis_and(TMP6, MASK_fe, TMP6);
        ref += stride;

        vis_ld64(ref[0], TMP14);
        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_and(TMP8, MASK_fe, TMP8);

        vis_ld64_2(ref, 8, TMP16);
        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_and(REF_0, REF_2, TMP10);

        vis_ld64_2(ref, 16, TMP18);
        ref += stride;
        vis_and(REF_4, REF_6, TMP12);

        /* Re-align the row-B data for both offsets. */
        vis_alignaddr_g0((void *)off);

        vis_faligndata(TMP0, TMP2, REF_0);

        vis_faligndata(TMP2, TMP4, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
        }

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_padd16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_padd16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
        dest += stride;

        /* Row B: same computation on the just-realigned data. */
        vis_xor(REF_0, REF_2, TMP6);

        vis_xor(REF_4, REF_6, TMP8);

        vis_and(TMP6, MASK_fe, TMP6);

        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_and(TMP8, MASK_fe, TMP8);

        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_and(REF_0, REF_2, TMP10);

        vis_and(REF_4, REF_6, TMP12);

        vis_alignaddr_g0((void *)off);

        vis_faligndata(TMP14, TMP16, REF_0);

        vis_faligndata(TMP16, TMP18, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP14, TMP16, REF_2);
            vis_faligndata(TMP16, TMP18, REF_6);
        } else {
            vis_src1(TMP16, REF_2);
            vis_src1(TMP18, REF_6);
        }

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_padd16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_padd16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
        dest += stride;
    } while (--height);

    /* Epilogue: last two rows. */
    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_xor(REF_4, REF_6, TMP8);

    vis_ld64_2(ref, 16, TMP4);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_and(TMP8, MASK_fe, TMP8);

    vis_mul8x16(CONST_128, TMP8, TMP8);
    vis_and(REF_0, REF_2, TMP10);

    vis_and(REF_4, REF_6, TMP12);

    vis_alignaddr_g0((void *)off);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
        vis_faligndata(TMP2, TMP4, REF_6);
    } else {
        vis_src1(TMP2, REF_2);
        vis_src1(TMP4, REF_6);
    }

    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_padd16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_padd16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);
    dest += stride;

    vis_xor(REF_0, REF_2, TMP6);

    vis_xor(REF_4, REF_6, TMP8);

    vis_and(TMP6, MASK_fe, TMP6);

    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_and(TMP8, MASK_fe, TMP8);

    vis_mul8x16(CONST_128, TMP8, TMP8);
    vis_and(REF_0, REF_2, TMP10);

    vis_and(REF_4, REF_6, TMP12);

    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_padd16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_padd16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);
}
2555
/* Horizontal half-pel copy, 8 bytes wide, no rounding:
 * dest = (ref + ref+1) >> 1 per byte, via (x&y) + ((x^y)>>1).
 * REF_0 holds the row aligned at `off`, REF_2 the same row aligned at
 * `off + 1` (or produced by vis_src1 when off == 7, where the +1
 * alignment would wrap into the following qword).  Two rows per
 * iteration plus a two-row epilogue; height assumed even and >= 4
 * (callers pass 8/16). */
static void MC_put_no_round_x_8_vis (uint8_t * dest, const uint8_t * _ref,
                                     const int stride, int height)
{
    /* NOTE(review): cast drops const — data is only read. */
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;   /* misalignment of ref */
    unsigned long off_plus_1 = off + 1;              /* alignment for ref+1 */

    ref = vis_alignaddr(ref);

    /* Prologue: first row's REF_0/REF_2 plus the averaging constants. */
    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);

    vis_ld64(constants128[0], CONST_128);
    vis_faligndata(TMP0, TMP2, REF_0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
    } else {
        vis_src1(TMP2, REF_2);
    }

    ref += stride;
    height = (height >> 1) - 1;         /* two rows per iteration + epilogue */

    do {    /* 20 cycles */
        /* Row A: average REF_0/REF_2 while loading rows A+1 and A+2. */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP4);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP4, MASK_fe, TMP4);
        ref += stride;

        vis_ld64(ref[0], TMP8);
        vis_and(REF_0, REF_2, TMP6);
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, 8, TMP10);
        ref += stride;
        vis_faligndata(TMP0, TMP2, REF_0);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
        } else {
            vis_src1(TMP2, REF_2);
        }

        vis_and(TMP4, MASK_7f, TMP4);

        vis_padd16(TMP6, TMP4, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;

        /* Row B: same computation on the just-realigned data. */
        vis_xor(REF_0, REF_2, TMP12);

        vis_and(TMP12, MASK_fe, TMP12);

        vis_and(REF_0, REF_2, TMP14);
        vis_mul8x16(CONST_128, TMP12, TMP12);

        vis_alignaddr_g0((void *)off);
        vis_faligndata(TMP8, TMP10, REF_0);
        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP8, TMP10, REF_2);
        } else {
            vis_src1(TMP10, REF_2);
        }

        vis_and(TMP12, MASK_7f, TMP12);

        vis_padd16(TMP14, TMP12, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;
    } while (--height);

    /* Epilogue: last two rows. */
    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP4);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_and(REF_0, REF_2, TMP6);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_alignaddr_g0((void *)off);

    vis_faligndata(TMP0, TMP2, REF_0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
    } else {
        vis_src1(TMP2, REF_2);
    }

    vis_and(TMP4, MASK_7f, TMP4);

    vis_padd16(TMP6, TMP4, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;

    vis_xor(REF_0, REF_2, TMP12);

    vis_and(TMP12, MASK_fe, TMP12);

    vis_and(REF_0, REF_2, TMP14);
    vis_mul8x16(CONST_128, TMP12, TMP12);

    vis_and(TMP12, MASK_7f, TMP12);

    vis_padd16(TMP14, TMP12, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;
}
2679
2680static void MC_avg_no_round_x_16_vis (uint8_t * dest, const uint8_t * _ref,
bb270c08 2681 const int stride, int height)
44f54ceb 2682{
bb270c08
DB
2683 uint8_t *ref = (uint8_t *) _ref;
2684 unsigned long off = (unsigned long) ref & 0x7;
2685 unsigned long off_plus_1 = off + 1;
44f54ceb 2686
bb270c08 2687 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
44f54ceb 2688
bb270c08
DB
2689 vis_ld64(constants3[0], CONST_3);
2690 vis_fzero(ZERO);
2691 vis_ld64(constants256_512[0], CONST_256);
44f54ceb 2692
bb270c08
DB
2693 ref = vis_alignaddr(ref);
2694 do { /* 26 cycles */
2695 vis_ld64(ref[0], TMP0);
44f54ceb 2696
bb270c08 2697 vis_ld64(ref[8], TMP2);
44f54ceb 2698
bb270c08 2699 vis_alignaddr_g0((void *)off);
44f54ceb 2700
bb270c08 2701 vis_ld64(ref[16], TMP4);
44f54ceb 2702
bb270c08
DB
2703 vis_ld64(dest[0], DST_0);
2704 vis_faligndata(TMP0, TMP2, REF_0);
44f54ceb 2705
bb270c08
DB
2706 vis_ld64(dest[8], DST_2);
2707 vis_faligndata(TMP2, TMP4, REF_4);
44f54ceb 2708
bb270c08
DB
2709 if (off != 0x7) {
2710 vis_alignaddr_g0((void *)off_plus_1);
2711 vis_faligndata(TMP0, TMP2, REF_2);
2712 vis_faligndata(TMP2, TMP4, REF_6);
2713 } else {
2714 vis_src1(TMP2, REF_2);
2715 vis_src1(TMP4, REF_6);
2716 }
44f54ceb 2717
bb270c08 2718 vis_mul8x16au(REF_0, CONST_256, TMP0);
44f54ceb 2719
bb270c08
DB
2720 vis_pmerge(ZERO, REF_2, TMP4);
2721 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
44f54ceb 2722
bb270c08 2723 vis_pmerge(ZERO, REF_2_1, TMP6);
44f54ceb 2724
bb270c08 2725 vis_padd16(TMP0, TMP4, TMP0);
44f54ceb 2726
bb270c08
DB
2727 vis_mul8x16al(DST_0, CONST_512, TMP4);
2728 vis_padd16(TMP2, TMP6, TMP2);
44f54ceb 2729
bb270c08 2730 vis_mul8x16al(DST_1, CONST_512, TMP6);
44f54ceb 2731
bb270c08 2732 vis_mul8x16au(REF_6, CONST_256, TMP12);
44f54ceb 2733
bb270c08
DB
2734 vis_padd16(TMP0, TMP4, TMP0);
2735 vis_mul8x16au(REF_6_1, CONST_256, TMP14);
44f54ceb 2736
bb270c08
DB
2737 vis_padd16(TMP2, TMP6, TMP2);
2738 vis_mul8x16au(REF_4, CONST_256, TMP16);
44f54ceb 2739
bb270c08
DB
2740 vis_padd16(TMP0, CONST_3, TMP8);
2741 vis_mul8x16au(REF_4_1, CONST_256, TMP18);
44f54ceb 2742
bb270c08
DB
2743 vis_padd16(TMP2, CONST_3, TMP10);
2744 vis_pack16(TMP8, DST_0);
44f54ceb 2745
bb270c08
DB
2746 vis_pack16(TMP10, DST_1);
2747 vis_padd16(TMP16, TMP12, TMP0);
44f54ceb 2748
bb270c08
DB
2749 vis_st64(DST_0, dest[0]);
2750 vis_mul8x16al(DST_2, CONST_512, TMP4);
2751 vis_padd16(TMP18, TMP14, TMP2);
44f54ceb 2752
bb270c08
DB
2753 vis_mul8x16al(DST_3, CONST_512, TMP6);
2754 vis_padd16(TMP0, CONST_3, TMP0);
44f54ceb 2755
bb270c08 2756 vis_padd16(TMP2, CONST_3, TMP2);
44f54ceb 2757
bb270c08 2758 vis_padd16(TMP0, TMP4, TMP0);
44f54ceb 2759
bb270c08
DB
2760 vis_padd16(TMP2, TMP6, TMP2);
2761 vis_pack16(TMP0, DST_2);
44f54ceb 2762
bb270c08
DB
2763 vis_pack16(TMP2, DST_3);
2764 vis_st64(DST_2, dest[8]);