PPC: Altivec split-radix FFT
[libav.git] / libavcodec / ppc / fft_altivec_s.S
/*
 * FFT transform with Altivec optimizations
 * Copyright (c) 2009 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * These functions are not individually interchangeable with the C versions.
 * While C takes arrays of FFTComplex, Altivec leaves intermediate results
 * in blocks as convenient to the vector size,
 * i.e. {4x real, 4x imaginary, 4x real, ...}
 *
 * I ignore the standard calling convention.
 * Instead, the following registers are treated as global constants:
 * v14: zero
 * v15..v18: cosines
 * v19..v29: permutations
 * r9: 16
 * r12: ff_cos_tabs
 * and the rest are free for local use.
 */
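/*
 * Illustrative sketch of the two layouts (not part of the build): the C code
 * operates on
 *     FFTComplex z[n];   // memory: {re0,im0, re1,im1, re2,im2, ...}
 * whereas between stages this code keeps each pair of vectors de-interleaved:
 *     {re0,re1,re2,re3}, {im0,im1,im2,im3}, {re4,re5,re6,re7}, {im4,...}, ...
 * The interleaved dispatch variant merges the result back into {re,im} pairs
 * during its final pass.
 */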

#include "config.h"
#include "asm.S"

.text

/* Apple gas doesn't support this shorthand */
.macro mtctr  rx
    mtspr   9, \rx
.endm

.macro addi2 ra, imm // add 32-bit immediate
.if \imm & 0xffff
    addi    \ra, \ra, \imm@l
.endif
.if (\imm+0x8000)>>16
    addis   \ra, \ra, \imm@ha
.endif
.endm
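/*
 * For example (a sketch of the expansion, not code that appears below),
 * "addi2 r3, 0x12348765" assembles to
 *     addi  r3, r3, 0x12348765@l   // add the sign-extended low half, -0x789b
 *     addis r3, r3, 0x12348765@ha  // add 0x1235 << 16, compensating for that sign extension
 * The addi is omitted when the low half is zero, and the addis when the
 * immediate already fits in 16 signed bits.
 */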

#if ARCH_PPC64
#define PTR .quad
.macro LOAD_PTR ra, rbase, offset
    ld      \ra,(\offset)*8(\rbase)
.endm
.macro STORE_PTR ra, rbase, offset
    std     \ra,(\offset)*8(\rbase)
.endm
#else
#define PTR .int
.macro LOAD_PTR ra, rbase, offset
    lwz     \ra,(\offset)*4(\rbase)
.endm
.macro STORE_PTR ra, rbase, offset
    stw     \ra,(\offset)*4(\rbase)
.endm
#endif

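// Single 4-point transform: \a0/\a1 hold the four inputs as interleaved
// {re,im} pairs (clobbered); the de-interleaved result is left with the real
// parts in \a2 and the imaginary parts in \a3.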
.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3
    vperm   \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm   \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vaddfp  \a0,\a2,\a3                         // {t1,t2,t6,t5}
    vsubfp  \a1,\a2,\a3                         // {t3,t4,t8,t7}
    vmrghw  \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm   \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vaddfp  \a0,\a2,\a3                         // {r0,r1,i0,i1}
    vsubfp  \a1,\a2,\a3                         // {r2,r3,i2,i3}
    vperm   \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm   \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
.endm

.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3 // two interleaved FFT4s on independent data
    vperm   \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm   \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vperm   \b2,\b0,\b1,v20
    vperm   \b3,\b0,\b1,v21
    vaddfp  \a0,\a2,\a3     // {t1,t2,t6,t5}
    vsubfp  \a1,\a2,\a3     // {t3,t4,t8,t7}
    vaddfp  \b0,\b2,\b3
    vsubfp  \b1,\b2,\b3
    vmrghw  \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm   \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vmrghw  \b2,\b0,\b1
    vperm   \b3,\b0,\b1,v22
    vaddfp  \a0,\a2,\a3     // {r0,r1,i0,i1}
    vsubfp  \a1,\a2,\a3     // {r2,r3,i2,i3}
    vaddfp  \b0,\b2,\b3
    vsubfp  \b1,\b2,\b3
    vperm   \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm   \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
    vperm   \b2,\b0,\b1,v23
    vperm   \b3,\b0,\b1,v24
.endm

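// 8-point transform: \a0..\b1 hold z0..z7 as interleaved {re,im} pairs on
// entry and the de-interleaved result on exit ({r0..r3}, {i0..i3}, {r4..r7},
// {i4..i7}); \a2..\b4 are clobbered as temporaries.  Relies on the global
// constants v14 (zero) and v17/v18 (+-1/sqrt(2) patterns) noted in the header.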
.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1
    vmrghw  \b2,\b0,\b1     // vcprm(0,s0,1,s1) // {r4,r6,i4,i6}
    vmrglw  \b3,\b0,\b1     // vcprm(2,s2,3,s3) // {r5,r7,i5,i7}
    vperm   \a2,\a0,\a1,v20 // FFT4 ...
    vperm   \a3,\a0,\a1,v21
    vaddfp  \b0,\b2,\b3     // {t1,t3,t2,t4}
    vsubfp  \b1,\b2,\b3     // {r5,r7,i5,i7}
    vperm   \b4,\b1,\b1,v25 // vcprm(2,3,0,1)   // {i5,i7,r5,r7}
    vaddfp  \a0,\a2,\a3
    vsubfp  \a1,\a2,\a3
    vmaddfp \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2)
    vmaddfp \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9}
    vmrghw  \a2,\a0,\a1
    vperm   \a3,\a0,\a1,v22
    vperm   \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8}
    vperm   \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta}
    vaddfp  \a0,\a2,\a3
    vsubfp  \a1,\a2,\a3
    vaddfp  \b0,\b2,\b3     // {t1,t2,t9,ta}
    vsubfp  \b1,\b2,\b3     // {t6,t5,tc,tb}
    vperm   \a2,\a0,\a1,v23
    vperm   \a3,\a0,\a1,v24
    vperm   \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb}
    vperm   \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc}
    vsubfp  \b0,\a2,\b2     // {r4,r5,r6,r7}
    vsubfp  \b1,\a3,\b3     // {i4,i5,i6,i7}
    vaddfp  \a0,\a2,\b2     // {r0,r1,r2,r3}
    vaddfp  \a1,\a3,\b3     // {i0,i1,i2,i3}
.endm

.macro BF d0,d1,s0,s1 // butterfly: d0 = s0+s1, d1 = s0-s1
    vsubfp  \d1,\s0,\s1
    vaddfp  \d0,\s0,\s1
.endm

fft4_altivec:
    lvx     v0, 0,r3
    lvx     v1,r9,r3
    FFT4    v0,v1,v2,v3
    stvx    v2, 0,r3
    stvx    v3,r9,r3
    blr

fft8_altivec:
    addi    r4,r3,32
    lvx     v0, 0,r3
    lvx     v1,r9,r3
    lvx     v2, 0,r4
    lvx     v3,r9,r4
    FFT8    v0,v1,v2,v3,v4,v5,v6,v7,v8
    stvx    v0, 0,r3
    stvx    v1,r9,r3
    stvx    v2, 0,r4
    stvx    v3,r9,r4
    blr

fft16_altivec:
    addi    r5,r3,64
    addi    r6,r3,96
    addi    r4,r3,32
    lvx     v0, 0,r5
    lvx     v1,r9,r5
    lvx     v2, 0,r6
    lvx     v3,r9,r6
    FFT4x2  v0,v1,v2,v3,v4,v5,v6,v7
    lvx     v0, 0,r3
    lvx     v1,r9,r3
    lvx     v2, 0,r4
    lvx     v3,r9,r4
    FFT8    v0,v1,v2,v3,v8,v9,v10,v11,v12
    vmaddfp   v8,v4,v15,v14 // r2*wre
    vmaddfp   v9,v5,v15,v14 // i2*wre
    vmaddfp  v10,v6,v15,v14 // r3*wre
    vmaddfp  v11,v7,v15,v14 // i3*wre
    vmaddfp   v8,v5,v16,v8  // i2*wim
    vnmsubfp  v9,v4,v16,v9  // r2*wim
    vnmsubfp v10,v7,v16,v10 // i3*wim
    vmaddfp  v11,v6,v16,v11 // r3*wim
    BF      v10,v12,v10,v8
    BF      v11,v13,v9,v11
    BF      v0,v4,v0,v10
    BF      v3,v7,v3,v12
    stvx    v0, 0,r3
    stvx    v4, 0,r5
    stvx    v3,r9,r4
    stvx    v7,r9,r6
    BF      v1,v5,v1,v11
    BF      v2,v6,v2,v13
    stvx    v1,r9,r3
    stvx    v5,r9,r5
    stvx    v2, 0,r4
    stvx    v6, 0,r6
    blr

// void pass(float *z, float *wre, int n)
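// Generated as fft_pass\suffix\()_altivec, using the non-standard convention
// described in the header: r3 = z in blocked {4 re, 4 im} form, r4 = wre (the
// cosine half of the twiddle table), r5 = iteration count (transform size / 16,
// see DECL_FFT).  The sine half (wim) is read backwards starting at wre + r5*16.
// Each iteration applies the split-radix butterflies combining z[0], z[o1],
// z[o2] and z[o3] with one vector of twiddles; the interleaved variant also
// merges the re/im blocks back into {re,im} pairs as it stores the result.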
.macro PASS interleave, suffix
fft_pass\suffix\()_altivec:
    mtctr   r5
    slwi    r0,r5,4
    slwi    r7,r5,6    // o2
    slwi    r5,r5,5    // o1
    add     r10,r5,r7  // o3
    add     r0,r4,r0   // wim
    addi    r6,r5,16   // o1+16
    addi    r8,r7,16   // o2+16
    addi    r11,r10,16 // o3+16
1:
    lvx     v8, 0,r4   // wre
    lvx     v10, 0,r0  // wim
    sub     r0,r0,r9
    lvx     v9, 0,r0
    vperm   v9,v9,v10,v19 // vcprm(s0,3,2,1) => wim[0 .. -3]
    lvx     v4,r3,r7      // r2 = z[o2]
    lvx     v5,r3,r8      // i2 = z[o2+16]
    lvx     v6,r3,r10     // r3 = z[o3]
    lvx     v7,r3,r11     // i3 = z[o3+16]
    vmaddfp v10,v4,v8,v14 // r2*wre
    vmaddfp v11,v5,v8,v14 // i2*wre
    vmaddfp v12,v6,v8,v14 // r3*wre
    vmaddfp v13,v7,v8,v14 // i3*wre
    lvx     v0, 0,r3      // r0 = z[0]
    lvx     v3,r3,r6      // i1 = z[o1+16]
    vmaddfp  v10,v5,v9,v10 // i2*wim
    vnmsubfp v11,v4,v9,v11 // r2*wim
    vnmsubfp v12,v7,v9,v12 // i3*wim
    vmaddfp  v13,v6,v9,v13 // r3*wim
    lvx     v1,r3,r9      // i0 = z[16]
    lvx     v2,r3,r5      // r1 = z[o1]
    BF      v12,v8,v12,v10
    BF      v13,v9,v11,v13
    BF      v0,v4,v0,v12
    BF      v3,v7,v3,v8
.if !\interleave
    stvx    v0, 0,r3
    stvx    v4,r3,r7
    stvx    v3,r3,r6
    stvx    v7,r3,r11
.endif
    BF      v1,v5,v1,v13
    BF      v2,v6,v2,v9
.if !\interleave
    stvx    v1,r3,r9
    stvx    v2,r3,r5
    stvx    v5,r3,r8
    stvx    v6,r3,r10
.else
    vmrghw  v8,v0,v1
    vmrglw  v9,v0,v1
    stvx    v8, 0,r3
    stvx    v9,r3,r9
    vmrghw  v8,v2,v3
    vmrglw  v9,v2,v3
    stvx    v8,r3,r5
    stvx    v9,r3,r6
    vmrghw  v8,v4,v5
    vmrglw  v9,v4,v5
    stvx    v8,r3,r7
    stvx    v9,r3,r8
    vmrghw  v8,v6,v7
    vmrglw  v9,v6,v7
    stvx    v8,r3,r10
    stvx    v9,r3,r11
.endif
    addi    r3,r3,32
    addi    r4,r4,16
    bdnz    1b
    sub     r3,r3,r5
    blr
.endm

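// Split-radix recursion: an N-point transform is an N/2-point transform on
// the first half of z followed by two N/4-point transforms on the third and
// fourth quarters, combined by one pass over the data with the size-N twiddle
// table (ff_cos_tabs[\bits]) and N/16 iterations.  The addi2 offsets are in
// bytes (8 bytes per complex value); the link register is spilled to a
// per-size slot relative to r1 (\bits-5), so nested calls of different sizes
// do not overwrite each other's saved return address.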
.macro DECL_FFT suffix, bits, n, n2, n4
fft\n\suffix\()_altivec:
    mflr    r0
    STORE_PTR r0,r1,\bits-5
    bl      fft\n2\()_altivec
    addi2   r3,\n*4
    bl      fft\n4\()_altivec
    addi2   r3,\n*2
    bl      fft\n4\()_altivec
    addi2   r3,\n*-6
    LOAD_PTR r0,r1,\bits-5
    LOAD_PTR r4,r12,\bits
    mtlr    r0
    li      r5,\n/16
    b       fft_pass\suffix\()_altivec
.endm

.macro DECL_FFTS interleave, suffix
    .text
    PASS \interleave, \suffix
    DECL_FFT \suffix, 5,    32,    16,     8
    DECL_FFT \suffix, 6,    64,    32,    16
    DECL_FFT \suffix, 7,   128,    64,    32
    DECL_FFT \suffix, 8,   256,   128,    64
    DECL_FFT \suffix, 9,   512,   256,   128
    DECL_FFT \suffix,10,  1024,   512,   256
    DECL_FFT \suffix,11,  2048,  1024,   512
    DECL_FFT \suffix,12,  4096,  2048,  1024
    DECL_FFT \suffix,13,  8192,  4096,  2048
    DECL_FFT \suffix,14, 16384,  8192,  4096
    DECL_FFT \suffix,15, 32768, 16384,  8192
    DECL_FFT \suffix,16, 65536, 32768, 16384

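    // Dispatch table of entry points indexed by log2(N) - 2
    // (index 0 -> 4-point transform, index 14 -> 65536-point).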
    .rodata
    .global EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec
EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec:
    PTR fft4_altivec
    PTR fft8_altivec
    PTR fft16_altivec
    PTR fft32\suffix\()_altivec
    PTR fft64\suffix\()_altivec
    PTR fft128\suffix\()_altivec
    PTR fft256\suffix\()_altivec
    PTR fft512\suffix\()_altivec
    PTR fft1024\suffix\()_altivec
    PTR fft2048\suffix\()_altivec
    PTR fft4096\suffix\()_altivec
    PTR fft8192\suffix\()_altivec
    PTR fft16384\suffix\()_altivec
    PTR fft32768\suffix\()_altivec
    PTR fft65536\suffix\()_altivec
.endm

DECL_FFTS 0
DECL_FFTS 1, _interleave