Commit | Line | Data |
---|---|---|
8d268a7d FB |
1 | /* |
2 | * FFT/IFFT transforms | |
3 | * AltiVec-enabled | |
bf7ba153 | 4 | * Copyright (c) 2009 Loren Merritt |
8d268a7d | 5 | * |
2912e87a | 6 | * This file is part of Libav. |
b78e7197 | 7 | * |
2912e87a | 8 | * Libav is free software; you can redistribute it and/or |
8d268a7d FB |
9 | * modify it under the terms of the GNU Lesser General Public |
10 | * License as published by the Free Software Foundation; either | |
b78e7197 | 11 | * version 2.1 of the License, or (at your option) any later version. |
8d268a7d | 12 | * |
2912e87a | 13 | * Libav is distributed in the hope that it will be useful, |
8d268a7d FB |
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 | * Lesser General Public License for more details. | |
17 | * | |
18 | * You should have received a copy of the GNU Lesser General Public | |
2912e87a | 19 | * License along with Libav; if not, write to the Free Software |
5509bffa | 20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
8d268a7d | 21 | */ |
98db4e2a | 22 | |
38282149 | 23 | #include "config.h" |
6af2c351 | 24 | #include "libavutil/cpu.h" |
27860819 | 25 | #include "libavutil/ppc/cpu.h" |
98db4e2a JR |
26 | #include "libavutil/ppc/types_altivec.h" |
27 | #include "libavutil/ppc/util_altivec.h" | |
1429224b | 28 | #include "libavcodec/fft.h" |
ddb8c2c0 | 29 | |
8d268a7d | 30 | /** |
82ee14d2 DB |
31 | * Do a complex FFT with the parameters defined in ff_fft_init(). |
32 | * The input data must be permuted before with s->revtab table. | |
33 | * No 1.0 / sqrt(n) normalization is done. | |
34 | * AltiVec-enabled: | |
35 | * This code assumes that the 'z' pointer is 16 bytes-aligned. | |
36 | * It also assumes all FFTComplex are 8 bytes-aligned pairs of floats. | |
8d268a7d | 37 | */ |
115329f1 | 38 | |
a46b84d1 MR |
39 | void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z); |
40 | void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z); | |
115329f1 | 41 | |
da60b99a | 42 | #if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN |
7f75f2f2 | 43 | static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input) |
cf61994a LM |
44 | { |
45 | int j, k; | |
46 | int n = 1 << s->mdct_bits; | |
47 | int n4 = n >> 2; | |
48 | int n8 = n >> 3; | |
49 | int n32 = n >> 5; | |
50 | const uint16_t *revtabj = s->revtab; | |
51 | const uint16_t *revtabk = s->revtab+n4; | |
52 | const vec_f *tcos = (const vec_f*)(s->tcos+n8); | |
53 | const vec_f *tsin = (const vec_f*)(s->tsin+n8); | |
54 | const vec_f *pin = (const vec_f*)(input+n4); | |
55 | vec_f *pout = (vec_f*)(output+n4); | |
56 | ||
57 | /* pre rotation */ | |
58 | k = n32-1; | |
59 | do { | |
60 | vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d; | |
61 | #define CMULA(p,o0,o1,o2,o3)\ | |
62 | a = pin[ k*2+p]; /* { z[k].re, z[k].im, z[k+1].re, z[k+1].im } */\ | |
63 | b = pin[-k*2-p-1]; /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\ | |
64 | re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } */\ | |
65 | im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } */\ | |
66 | cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\ | |
67 | sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\ | |
68 | r##p = im*cos - re*sin;\ | |
69 | i##p = re*cos + im*sin; | |
70 | #define STORE2(v,dst)\ | |
71 | j = dst;\ | |
72 | vec_ste(v, 0, output+j*2);\ | |
73 | vec_ste(v, 4, output+j*2); | |
74 | #define STORE8(p)\ | |
75 | a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\ | |
76 | b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\ | |
77 | c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\ | |
78 | d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\ | |
79 | STORE2(a, revtabk[ p*2-4]);\ | |
80 | STORE2(b, revtabk[ p*2-3]);\ | |
81 | STORE2(c, revtabj[-p*2+2]);\ | |
82 | STORE2(d, revtabj[-p*2+3]); | |
83 | ||
84 | cos0 = tcos[k]; | |
85 | sin0 = tsin[k]; | |
86 | cos1 = tcos[-k-1]; | |
87 | sin1 = tsin[-k-1]; | |
88 | CMULA(0, 0,1,2,3); | |
89 | CMULA(1, 2,3,0,1); | |
90 | STORE8(0); | |
91 | STORE8(1); | |
92 | revtabj += 4; | |
93 | revtabk -= 4; | |
94 | k--; | |
95 | } while(k >= 0); | |
96 | ||
a46b84d1 | 97 | ff_fft_calc_altivec(s, (FFTComplex*)output); |
cf61994a LM |
98 | |
99 | /* post rotation + reordering */ | |
100 | j = -n32; | |
101 | k = n32-1; | |
102 | do { | |
103 | vec_f cos,sin,re,im,a,b,c,d; | |
104 | #define CMULB(d0,d1,o)\ | |
105 | re = pout[o*2];\ | |
106 | im = pout[o*2+1];\ | |
107 | cos = tcos[o];\ | |
108 | sin = tsin[o];\ | |
109 | d0 = im*sin - re*cos;\ | |
110 | d1 = re*sin + im*cos; | |
111 | ||
112 | CMULB(a,b,j); | |
113 | CMULB(c,d,k); | |
114 | pout[2*j] = vec_perm(a, d, vcprm(0,s3,1,s2)); | |
115 | pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0)); | |
116 | pout[2*k] = vec_perm(c, b, vcprm(0,s3,1,s2)); | |
117 | pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0)); | |
118 | j++; | |
119 | k--; | |
120 | } while(k >= 0); | |
121 | } | |
122 | ||
7f75f2f2 | 123 | static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input) |
cf61994a LM |
124 | { |
125 | int k; | |
126 | int n = 1 << s->mdct_bits; | |
127 | int n4 = n >> 2; | |
128 | int n16 = n >> 4; | |
187a5379 | 129 | vec_u32 sign = {1U<<31,1U<<31,1U<<31,1U<<31}; |
cf61994a LM |
130 | vec_u32 *p0 = (vec_u32*)(output+n4); |
131 | vec_u32 *p1 = (vec_u32*)(output+n4*3); | |
132 | ||
7f75f2f2 | 133 | imdct_half_altivec(s, output + n4, input); |
cf61994a LM |
134 | |
135 | for (k = 0; k < n16; k++) { | |
136 | vec_u32 a = p0[k] ^ sign; | |
137 | vec_u32 b = p1[-k-1]; | |
138 | p0[-k-1] = vec_perm(a, a, vcprm(3,2,1,0)); | |
139 | p1[k] = vec_perm(b, b, vcprm(3,2,1,0)); | |
140 | } | |
141 | } | |
47570dbd | 142 | #endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */ |
cf61994a | 143 | |
38282149 | 144 | av_cold void ff_fft_init_ppc(FFTContext *s) |
f4863213 | 145 | { |
da60b99a | 146 | #if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN |
27860819 | 147 | if (!PPC_ALTIVEC(av_get_cpu_flags())) |
6af2c351 DB |
148 | return; |
149 | ||
a46b84d1 | 150 | s->fft_calc = ff_fft_calc_interleave_altivec; |
6cbf2420 | 151 | if (s->mdct_bits >= 5) { |
7f75f2f2 DB |
152 | s->imdct_calc = imdct_calc_altivec; |
153 | s->imdct_half = imdct_half_altivec; | |
6cbf2420 | 154 | } |
47570dbd | 155 | #endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */ |
f4863213 | 156 | } |