/*
 * High quality image resampling with polyphase filters
 * Copyright (c) 2001 Fabrice Bellard
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/ppc/imgresample_altivec.c
 * High quality image resampling with polyphase filters - AltiVec bits
 */

#include "util_altivec.h"
#define FILTER_BITS 8
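
/* The filter taps are 8-bit fixed point (the scalar paths below shift
   the accumulated sum right by FILTER_BITS), so a unity-gain phase is
   expected to sum to 1 << FILTER_BITS = 256. */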
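
/* Union so one element can be written with scalar code and then
   splatted across the whole vector register. */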
typedef union {
    vector signed short v;
    signed short s[8];
} vec_ss;

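/* Vertical 4-tap resample: each output pixel is a weighted sum of 4
   source pixels spaced 'wrap' bytes apart, one per source line, with a
   single set of taps for the whole row. */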
void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
                          int wrap, int16_t *filter)
{
    int sum, i;
    const uint8_t *s;
    vector unsigned char *tv, tmp, dstv, zero;
    vec_ss srchv[4], srclv[4], fv[4];
    vector signed short zeros, sumhv, sumlv;
    s = src;

    for (i = 0; i < 4; i++) {
        /*
           The vec_madds later on does an implicit >>15 on the result.
           Since FILTER_BITS is 8, and we have 15 bits of magnitude in
           a signed short, we have just enough bits to pre-shift our
           filter constants <<7 to compensate for vec_madds.
        */
        fv[i].s[0] = filter[i] << (15 - FILTER_BITS);
        fv[i].v = vec_splat(fv[i].v, 0);
    }
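
    /* For example, a half-weight tap of 128 (0.5 in 8-bit fixed point)
       becomes 128 << 7 = 16384, and vec_madds(src, 16384, 0) then yields
       roughly (src * 16384) >> 15 = src / 2, as intended. */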

    zero = vec_splat_u8(0);
    zeros = vec_splat_s16(0);

    /*
       When we're resampling, we'd ideally like both our input buffers
       and output buffers to be 16-byte aligned, so we can do both aligned
       reads and writes. Sadly we can't always have this at the moment, so
       we opt for aligned writes, as unaligned writes have a huge overhead.
       To do this, do enough scalar resamples to get dst 16-byte aligned.
    */
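    /* (-dst) & 0xf is the byte count from dst up to the next 16-byte
       boundary (0 if dst is already aligned), i.e. how many pixels the
       scalar prologue must handle. */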
    i = (-(int)dst) & 0xf;
    while (i > 0) {
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
        sum = sum >> FILTER_BITS;
        if (sum < 0) sum = 0; else if (sum > 255) sum = 255;
        dst[0] = sum;
        dst++;
        s++;
        dst_width--;
        i--;
    }

    /* Do our AltiVec resampling on 16 pixels at once. */
    while (dst_width >= 16) {
        /* Read 16 (potentially unaligned) bytes from each of
           4 lines into 4 vectors, and split them into shorts.
           Interleave the multiply/accumulate for the resample
           filter with the loads to hide the 3 cycle latency
           the vec_madds have. */
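        /* Classic AltiVec unaligned-load idiom: two aligned loads
           (tv[0] and tv[1]) straddle the wanted data, and vec_perm with
           the vec_lvsl shift mask moves the 16 wanted bytes into place. */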
        tv = (vector unsigned char *) &s[0 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[0 * wrap]));
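        /* Merging with the zero vector interleaves a 0x00 byte before
           each pixel byte, zero-extending the 16 u8 pixels into two
           vectors of 8 s16 lanes (high half and low half). */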
        srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
        sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);

        tv = (vector unsigned char *) &s[1 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
        srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
        sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);

        tv = (vector unsigned char *) &s[2 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
        srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
        sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);

        tv = (vector unsigned char *) &s[3 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
        srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
        sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);

        /* Pack the results into our destination vector,
           and do an aligned write of that back to memory. */
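        /* vec_packsu saturates each 16-bit lane to [0, 255] as it packs,
           the vector counterpart of the scalar clamp used below. */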
        dstv = vec_packsu(sumhv, sumlv);
        vec_st(dstv, 0, (vector unsigned char *) dst);
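        /* vec_st is an aligned store (the address is truncated to a
           16-byte boundary), which is why dst was aligned up front. */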

        dst += 16;
        s += 16;
        dst_width -= 16;
    }

    /* If there are any leftover pixels, resample them
       with the slow scalar method. */
    while (dst_width > 0) {
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
        sum = sum >> FILTER_BITS;
        if (sum < 0) sum = 0; else if (sum > 255) sum = 255;
        dst[0] = sum;
        dst++;
        s++;
        dst_width--;
    }
}