Altivec on non darwin systems patch by Romain Dolbeau
[libav.git] / libavcodec / ppc / fft_altivec.c
1 /*
2 * FFT/IFFT transforms
3 * AltiVec-enabled
4 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
5 * Based on code Copyright (c) 2002 Fabrice Bellard.
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21 #include "../dsputil.h"
22
23 #include "dsputil_altivec.h"
24
25 /*
26 those three macros are from libavcodec/fft.c
27 and are required for the reference C code
28 */
29 /* butter fly op */
30 #define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
31 {\
32 FFTSample ax, ay, bx, by;\
33 bx=pre1;\
34 by=pim1;\
35 ax=qre1;\
36 ay=qim1;\
37 pre = (bx + ax);\
38 pim = (by + ay);\
39 qre = (bx - ax);\
40 qim = (by - ay);\
41 }
42 #define MUL16(a,b) ((a) * (b))
43 #define CMUL(pre, pim, are, aim, bre, bim) \
44 {\
45 pre = (MUL16(are, bre) - MUL16(aim, bim));\
46 pim = (MUL16(are, bim) + MUL16(bre, aim));\
47 }
48
49
50 /**
51 * Do a complex FFT with the parameters defined in fft_init(). The
52 * input data must be permuted before with s->revtab table. No
53 * 1.0/sqrt(n) normalization is done.
54 * AltiVec-enabled
55 * This code assumes that the 'z' pointer is 16 bytes-aligned
56 * It also assumes all FFTComplex are 8 bytes-aligned pair of float
57 * The code is exactly the same as the SSE version, except
58 * that successive MUL + ADD/SUB have been merged into
59 * fused multiply-add ('vec_madd' in altivec)
60 */
61 void fft_calc_altivec(FFTContext *s, FFTComplex *z)
62 {
63 POWERPC_TBL_DECLARE(altivec_fft_num, s->nbits >= 6);
64 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
65 int ln = s->nbits;
66 int j, np, np2;
67 int nblocks, nloops;
68 register FFTComplex *p, *q;
69 FFTComplex *exptab = s->exptab;
70 int l;
71 FFTSample tmp_re, tmp_im;
72
73 POWERPC_TBL_START_COUNT(altivec_fft_num, s->nbits >= 6);
74
75 np = 1 << ln;
76
77 /* pass 0 */
78
79 p=&z[0];
80 j=(np >> 1);
81 do {
82 BF(p[0].re, p[0].im, p[1].re, p[1].im,
83 p[0].re, p[0].im, p[1].re, p[1].im);
84 p+=2;
85 } while (--j != 0);
86
87 /* pass 1 */
88
89
90 p=&z[0];
91 j=np >> 2;
92 if (s->inverse) {
93 do {
94 BF(p[0].re, p[0].im, p[2].re, p[2].im,
95 p[0].re, p[0].im, p[2].re, p[2].im);
96 BF(p[1].re, p[1].im, p[3].re, p[3].im,
97 p[1].re, p[1].im, -p[3].im, p[3].re);
98 p+=4;
99 } while (--j != 0);
100 } else {
101 do {
102 BF(p[0].re, p[0].im, p[2].re, p[2].im,
103 p[0].re, p[0].im, p[2].re, p[2].im);
104 BF(p[1].re, p[1].im, p[3].re, p[3].im,
105 p[1].re, p[1].im, p[3].im, -p[3].re);
106 p+=4;
107 } while (--j != 0);
108 }
109 /* pass 2 .. ln-1 */
110
111 nblocks = np >> 3;
112 nloops = 1 << 2;
113 np2 = np >> 1;
114 do {
115 p = z;
116 q = z + nloops;
117 for (j = 0; j < nblocks; ++j) {
118 BF(p->re, p->im, q->re, q->im,
119 p->re, p->im, q->re, q->im);
120
121 p++;
122 q++;
123 for(l = nblocks; l < np2; l += nblocks) {
124 CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im);
125 BF(p->re, p->im, q->re, q->im,
126 p->re, p->im, tmp_re, tmp_im);
127 p++;
128 q++;
129 }
130
131 p += nloops;
132 q += nloops;
133 }
134 nblocks = nblocks >> 1;
135 nloops = nloops << 1;
136 } while (nblocks != 0);
137
138 POWERPC_TBL_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
139
140 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
141 #ifdef CONFIG_DARWIN
142 register const vector float vczero = (const vector float)(0.);
143 #else
144 register const vector float vczero = (const vector float){0.,0.,0.,0.};
145 #endif
146
147 int ln = s->nbits;
148 int j, np, np2;
149 int nblocks, nloops;
150 register FFTComplex *p, *q;
151 FFTComplex *cptr, *cptr1;
152 int k;
153
154 POWERPC_TBL_START_COUNT(altivec_fft_num, s->nbits >= 6);
155
156 np = 1 << ln;
157
158 {
159 vector float *r, a, b, a1, c1, c2;
160
161 r = (vector float *)&z[0];
162
163 c1 = vcii(p,p,n,n);
164
165 if (s->inverse)
166 {
167 c2 = vcii(p,p,n,p);
168 }
169 else
170 {
171 c2 = vcii(p,p,p,n);
172 }
173
174 j = (np >> 2);
175 do {
176 a = vec_ld(0, r);
177 a1 = vec_ld(sizeof(vector float), r);
178
179 b = vec_perm(a,a,vcprmle(1,0,3,2));
180 a = vec_madd(a,c1,b);
181 /* do the pass 0 butterfly */
182
183 b = vec_perm(a1,a1,vcprmle(1,0,3,2));
184 b = vec_madd(a1,c1,b);
185 /* do the pass 0 butterfly */
186
187 /* multiply third by -i */
188 b = vec_perm(b,b,vcprmle(2,3,1,0));
189
190 /* do the pass 1 butterfly */
191 vec_st(vec_madd(b,c2,a), 0, r);
192 vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r);
193
194 r += 2;
195 } while (--j != 0);
196 }
197 /* pass 2 .. ln-1 */
198
199 nblocks = np >> 3;
200 nloops = 1 << 2;
201 np2 = np >> 1;
202
203 cptr1 = s->exptab1;
204 do {
205 p = z;
206 q = z + nloops;
207 j = nblocks;
208 do {
209 cptr = cptr1;
210 k = nloops >> 1;
211 do {
212 vector float a,b,c,t1;
213
214 a = vec_ld(0, (float*)p);
215 b = vec_ld(0, (float*)q);
216
217 /* complex mul */
218 c = vec_ld(0, (float*)cptr);
219 /* cre*re cim*re */
220 t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero);
221 c = vec_ld(sizeof(vector float), (float*)cptr);
222 /* -cim*im cre*im */
223 b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1);
224
225 /* butterfly */
226 vec_st(vec_add(a,b), 0, (float*)p);
227 vec_st(vec_sub(a,b), 0, (float*)q);
228
229 p += 2;
230 q += 2;
231 cptr += 4;
232 } while (--k);
233
234 p += nloops;
235 q += nloops;
236 } while (--j);
237 cptr1 += nloops * 2;
238 nblocks = nblocks >> 1;
239 nloops = nloops << 1;
240 } while (nblocks != 0);
241
242 POWERPC_TBL_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
243
244 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
245 }