4 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
5 * Based on code Copyright (c) 2002 Fabrice Bellard.
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 #include "../dsputil.h"
23 #include "dsputil_altivec.h"
26 those three macros are from libavcodec/fft.c
27 and are required for the reference C code
30 #define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
32 FFTSample ax, ay, bx, by;\
42 #define MUL16(a,b) ((a) * (b))
43 #define CMUL(pre, pim, are, aim, bre, bim) \
45 pre = (MUL16(are, bre) - MUL16(aim, bim));\
46 pim = (MUL16(are, bim) + MUL16(bre, aim));\
51 * Do a complex FFT with the parameters defined in fft_init(). The
52 * input data must be permuted before with s->revtab table. No
53 * 1.0/sqrt(n) normalization is done.
55 * This code assumes that the 'z' pointer is 16 bytes-aligned
56 * It also assumes all FFTComplex are 8 bytes-aligned pair of float
57 * The code is exactly the same as the SSE version, except
58 * that successive MUL + ADD/SUB have been merged into
59 * fused multiply-add ('vec_madd' in altivec)
61 void fft_calc_altivec(FFTContext
*s
, FFTComplex
*z
)
63 POWERPC_TBL_DECLARE(altivec_fft_num
, s
->nbits
>= 6);
64 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
68 register FFTComplex
*p
, *q
;
69 FFTComplex
*exptab
= s
->exptab
;
71 FFTSample tmp_re
, tmp_im
;
73 POWERPC_TBL_START_COUNT(altivec_fft_num
, s
->nbits
>= 6);
82 BF(p
[0].re
, p
[0].im
, p
[1].re
, p
[1].im
,
83 p
[0].re
, p
[0].im
, p
[1].re
, p
[1].im
);
94 BF(p
[0].re
, p
[0].im
, p
[2].re
, p
[2].im
,
95 p
[0].re
, p
[0].im
, p
[2].re
, p
[2].im
);
96 BF(p
[1].re
, p
[1].im
, p
[3].re
, p
[3].im
,
97 p
[1].re
, p
[1].im
, -p
[3].im
, p
[3].re
);
102 BF(p
[0].re
, p
[0].im
, p
[2].re
, p
[2].im
,
103 p
[0].re
, p
[0].im
, p
[2].re
, p
[2].im
);
104 BF(p
[1].re
, p
[1].im
, p
[3].re
, p
[3].im
,
105 p
[1].re
, p
[1].im
, p
[3].im
, -p
[3].re
);
117 for (j
= 0; j
< nblocks
; ++j
) {
118 BF(p
->re
, p
->im
, q
->re
, q
->im
,
119 p
->re
, p
->im
, q
->re
, q
->im
);
123 for(l
= nblocks
; l
< np2
; l
+= nblocks
) {
124 CMUL(tmp_re
, tmp_im
, exptab
[l
].re
, exptab
[l
].im
, q
->re
, q
->im
);
125 BF(p
->re
, p
->im
, q
->re
, q
->im
,
126 p
->re
, p
->im
, tmp_re
, tmp_im
);
134 nblocks
= nblocks
>> 1;
135 nloops
= nloops
<< 1;
136 } while (nblocks
!= 0);
138 POWERPC_TBL_STOP_COUNT(altivec_fft_num
, s
->nbits
>= 6);
140 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
142 register const vector
float vczero
= (const vector
float)(0.);
144 register const vector
float vczero
= (const vector
float){0.,0.,0.,0.};
150 register FFTComplex
*p
, *q
;
151 FFTComplex
*cptr
, *cptr1
;
154 POWERPC_TBL_START_COUNT(altivec_fft_num
, s
->nbits
>= 6);
159 vector
float *r
, a
, b
, a1
, c1
, c2
;
161 r
= (vector
float *)&z
[0];
177 a1
= vec_ld(sizeof(vector
float), r
);
179 b
= vec_perm(a
,a
,vcprmle(1,0,3,2));
180 a
= vec_madd(a
,c1
,b
);
181 /* do the pass 0 butterfly */
183 b
= vec_perm(a1
,a1
,vcprmle(1,0,3,2));
184 b
= vec_madd(a1
,c1
,b
);
185 /* do the pass 0 butterfly */
187 /* multiply third by -i */
188 b
= vec_perm(b
,b
,vcprmle(2,3,1,0));
190 /* do the pass 1 butterfly */
191 vec_st(vec_madd(b
,c2
,a
), 0, r
);
192 vec_st(vec_nmsub(b
,c2
,a
), sizeof(vector
float), r
);
212 vector
float a
,b
,c
,t1
;
214 a
= vec_ld(0, (float*)p
);
215 b
= vec_ld(0, (float*)q
);
218 c
= vec_ld(0, (float*)cptr
);
220 t1
= vec_madd(c
, vec_perm(b
,b
,vcprmle(2,2,0,0)),vczero
);
221 c
= vec_ld(sizeof(vector
float), (float*)cptr
);
223 b
= vec_madd(c
, vec_perm(b
,b
,vcprmle(3,3,1,1)),t1
);
226 vec_st(vec_add(a
,b
), 0, (float*)p
);
227 vec_st(vec_sub(a
,b
), 0, (float*)q
);
238 nblocks
= nblocks
>> 1;
239 nloops
= nloops
<< 1;
240 } while (nblocks
!= 0);
242 POWERPC_TBL_STOP_COUNT(altivec_fft_num
, s
->nbits
>= 6);
244 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */