/*
 * FFT transform with Altivec optimizations
 * Copyright (c) 2009 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * These functions are not individually interchangeable with the C versions.
 * While C takes arrays of FFTComplex, Altivec leaves intermediate results
 * in blocks as convenient to the vector size.
 * i.e. {4x real, 4x imaginary, 4x real, ...}
 *
 * I ignore standard calling convention.
 * Instead, the following registers are treated as global constants:
 * v14: zero
 * v15..v18: cosines
 * v19..v29: permutations
 * r9: 16
 * r12: ff_cos_tabs
 * and the rest are free for local use.
 */
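/*
 * For example (illustrative values only): after a leaf transform, 8
 * consecutive floats hold { r0,r1,r2,r3, i0,i1,i2,i3 }, whereas the C side
 * would store the same data in FFTComplex order { r0,i0, r1,i1, r2,i2, r3,i3 }.
 * The *_interleave entry points zip the halves back into {re,im} pairs when
 * writing the final result.
 */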

#include "config.h"
#include "asm.S"

.text

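// addi2 builds a full 32-bit immediate add from addi (low half) and addis
// (high half). Since addi sign-extends its operand, @ha is the "high
// adjusted" half, i.e. (imm + 0x8000) >> 16, which is also why the second
// .if tests (\imm+0x8000)>>16 rather than \imm>>16.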
.macro addi2 ra, imm // add 32-bit immediate
.if \imm & 0xffff
    addi \ra, \ra, \imm@l
.endif
.if (\imm+0x8000)>>16
    addis \ra, \ra, \imm@ha
.endif
.endm

#if ARCH_PPC64
#define PTR .quad
.macro LOAD_PTR ra, rbase, offset
    ld \ra,(\offset)*8(\rbase)
.endm
.macro STORE_PTR ra, rbase, offset
    std \ra,(\offset)*8(\rbase)
.endm
#else
#define PTR .int
.macro LOAD_PTR ra, rbase, offset
    lwz \ra,(\offset)*4(\rbase)
.endm
.macro STORE_PTR ra, rbase, offset
    stw \ra,(\offset)*4(\rbase)
.endm
#endif

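// Temporaries referenced in the comments of FFT4/FFT4x2 below
// (derived from the data flow of the macros themselves):
//   t1 = r0+r1   t2 = i0+i1   t3 = r0-r1   t4 = i0-i1
//   t5 = i2+i3   t6 = r2+r3   t7 = i2-i3   t8 = r3-r2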
.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3
    vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5}
    vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7}
    vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1}
    vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3}
    vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
.endm

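// Two independent FFT4s at once, with the a* and b* instruction streams
// interleaved to hide vector latency.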
.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3
    vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vperm \b2,\b0,\b1,v20
    vperm \b3,\b0,\b1,v21
    vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5}
    vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7}
    vaddfp \b0,\b2,\b3
    vsubfp \b1,\b2,\b3
    vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vmrghw \b2,\b0,\b1
    vperm \b3,\b0,\b1,v22
    vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1}
    vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3}
    vaddfp \b0,\b2,\b3
    vsubfp \b1,\b2,\b3
    vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
    vperm \b2,\b0,\b1,v23
    vperm \b3,\b0,\b1,v24
.endm

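// 8-point transform: a0/a1 hold z[0..3] and b0/b1 hold z[4..7], both as
// interleaved {re,im} pairs on input; the results come out split, with
// a0/b0 holding the reals and a1/b1 the imaginaries. v17/v18 supply the
// +-1/sqrt(2) twiddle factors.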
.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1
    vmrghw \b2,\b0,\b1 // vcprm(0,s0,1,s1) // {r4,r6,i4,i6}
    vmrglw \b3,\b0,\b1 // vcprm(2,s2,3,s3) // {r5,r7,i5,i7}
    vperm \a2,\a0,\a1,v20 // FFT4 ...
    vperm \a3,\a0,\a1,v21
    vaddfp \b0,\b2,\b3 // {t1,t3,t2,t4}
    vsubfp \b1,\b2,\b3 // {r5,r7,i5,i7}
    vperm \b4,\b1,\b1,v25 // vcprm(2,3,0,1) // {i5,i7,r5,r7}
    vaddfp \a0,\a2,\a3
    vsubfp \a1,\a2,\a3
    vmaddfp \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2)
    vmaddfp \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9}
    vmrghw \a2,\a0,\a1
    vperm \a3,\a0,\a1,v22
    vperm \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8}
    vperm \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta}
    vaddfp \a0,\a2,\a3
    vsubfp \a1,\a2,\a3
    vaddfp \b0,\b2,\b3 // {t1,t2,t9,ta}
    vsubfp \b1,\b2,\b3 // {t6,t5,tc,tb}
    vperm \a2,\a0,\a1,v23
    vperm \a3,\a0,\a1,v24
    vperm \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb}
    vperm \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc}
    vsubfp \b0,\a2,\b2 // {r4,r5,r6,r7}
    vsubfp \b1,\a3,\b3 // {i4,i5,i6,i7}
    vaddfp \a0,\a2,\b2 // {r0,r1,r2,r3}
    vaddfp \a1,\a3,\b3 // {i0,i1,i2,i3}
.endm

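// butterfly: d0 = s0 + s1, d1 = s0 - s1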
.macro BF d0,d1,s0,s1
    vsubfp \d1,\s0,\s1
    vaddfp \d0,\s0,\s1
.endm

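// interleave two vectors word-wise: {a0,a1,a2,a3},{b0,b1,b2,b3} ->
// {a0,b0,a1,b1},{a2,b2,a3,b3}, i.e. split re/im blocks back to {re,im} pairs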
.macro zip d0,d1,s0,s1
    vmrghw \d0,\s0,\s1
    vmrglw \d1,\s0,\s1
.endm

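// Leaf transforms. Per the convention above, r3 points at z, r9 holds 16
// (the byte offset of the second vector of a block), and the \interleave
// variants zip the split halves back into {re,im} pairs before storing.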
.macro def_fft4 interleave
fft4\interleave\()_altivec:
    lvx v0, 0,r3
    lvx v1,r9,r3
    FFT4 v0,v1,v2,v3
.ifnb \interleave
    zip v0,v1,v2,v3
    stvx v0, 0,r3
    stvx v1,r9,r3
.else
    stvx v2, 0,r3
    stvx v3,r9,r3
.endif
    blr
.endm

.macro def_fft8 interleave
fft8\interleave\()_altivec:
    addi r4,r3,32
    lvx v0, 0,r3
    lvx v1,r9,r3
    lvx v2, 0,r4
    lvx v3,r9,r4
    FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8
.ifnb \interleave
    zip v4,v5,v0,v1
    zip v6,v7,v2,v3
    stvx v4, 0,r3
    stvx v5,r9,r3
    stvx v6, 0,r4
    stvx v7,r9,r4
.else
    stvx v0, 0,r3
    stvx v1,r9,r3
    stvx v2, 0,r4
    stvx v3,r9,r4
.endif
    blr
.endm

.macro def_fft16 interleave
fft16\interleave\()_altivec:
    addi r5,r3,64
    addi r6,r3,96
    addi r4,r3,32
    lvx v0, 0,r5
    lvx v1,r9,r5
    lvx v2, 0,r6
    lvx v3,r9,r6
    FFT4x2 v0,v1,v2,v3,v4,v5,v6,v7
    lvx v0, 0,r3
    lvx v1,r9,r3
    lvx v2, 0,r4
    lvx v3,r9,r4
    FFT8 v0,v1,v2,v3,v8,v9,v10,v11,v12
    vmaddfp v8,v4,v15,v14 // r2*wre
    vmaddfp v9,v5,v15,v14 // i2*wre
    vmaddfp v10,v6,v15,v14 // r3*wre
    vmaddfp v11,v7,v15,v14 // i3*wre
    vmaddfp v8,v5,v16,v8 // i2*wim
    vnmsubfp v9,v4,v16,v9 // r2*wim
    vnmsubfp v10,v7,v16,v10 // i3*wim
    vmaddfp v11,v6,v16,v11 // r3*wim
    BF v10,v12,v10,v8
    BF v11,v13,v9,v11
    BF v0,v4,v0,v10
    BF v3,v7,v3,v12
    BF v1,v5,v1,v11
    BF v2,v6,v2,v13
.ifnb \interleave
    zip v8, v9,v0,v1
    zip v10,v11,v2,v3
    zip v12,v13,v4,v5
    zip v14,v15,v6,v7
    stvx v8, 0,r3
    stvx v9,r9,r3
    stvx v10, 0,r4
    stvx v11,r9,r4
    stvx v12, 0,r5
    stvx v13,r9,r5
    stvx v14, 0,r6
    stvx v15,r9,r6
.else
    stvx v0, 0,r3
    stvx v4, 0,r5
    stvx v3,r9,r4
    stvx v7,r9,r6
    stvx v1,r9,r3
    stvx v5,r9,r5
    stvx v2, 0,r4
    stvx v6, 0,r6
.endif
    blr
.endm

// void pass(float *z, float *wre, int n)
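// One loop iteration handles one group of 4 complex values per block:
//   r0,i0 = z[0], z[16]       r1,i1 = z[o1], z[o1+16]
//   r2,i2 = z[o2], z[o2+16]   r3,i3 = z[o3], z[o3+16]
// with o1 = n*32, o2 = n*64, o3 = n*96 bytes, wre read forwards and wim
// read backwards from the cosine table. Derived from the vector comments
// below, each iteration computes
//   a = r2*wre + i2*wim       b = i2*wre - r2*wim
//   c = r3*wre - i3*wim       d = i3*wre + r3*wim
// and stores
//   z[0 ], z[16]      = r0 + (c+a), i0 + (b+d)
//   z[o1], z[o1+16]   = r1 + (b-d), i1 + (c-a)
//   z[o2], z[o2+16]   = r0 - (c+a), i0 - (b+d)
//   z[o3], z[o3+16]   = r1 - (b-d), i1 - (c-a)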
.macro PASS interleave, suffix
fft_pass\suffix\()_altivec:
    mtctr r5
    slwi r0,r5,4
    slwi r7,r5,6 // o2
    slwi r5,r5,5 // o1
    add r10,r5,r7 // o3
    add r0,r4,r0 // wim
    addi r6,r5,16 // o1+16
    addi r8,r7,16 // o2+16
    addi r11,r10,16 // o3+16
1:
    lvx v8, 0,r4 // wre
    lvx v10, 0,r0 // wim
    sub r0,r0,r9
    lvx v9, 0,r0
    vperm v9,v9,v10,v19 // vcprm(s0,3,2,1) => wim[0 .. -3]
    lvx v4,r3,r7 // r2 = z[o2]
    lvx v5,r3,r8 // i2 = z[o2+16]
    lvx v6,r3,r10 // r3 = z[o3]
    lvx v7,r3,r11 // i3 = z[o3+16]
    vmaddfp v10,v4,v8,v14 // r2*wre
    vmaddfp v11,v5,v8,v14 // i2*wre
    vmaddfp v12,v6,v8,v14 // r3*wre
    vmaddfp v13,v7,v8,v14 // i3*wre
    lvx v0, 0,r3 // r0 = z[0]
    lvx v3,r3,r6 // i1 = z[o1+16]
    vmaddfp v10,v5,v9,v10 // i2*wim
    vnmsubfp v11,v4,v9,v11 // r2*wim
    vnmsubfp v12,v7,v9,v12 // i3*wim
    vmaddfp v13,v6,v9,v13 // r3*wim
    lvx v1,r3,r9 // i0 = z[16]
    lvx v2,r3,r5 // r1 = z[o1]
    BF v12,v8,v12,v10
    BF v13,v9,v11,v13
    BF v0,v4,v0,v12
    BF v3,v7,v3,v8
.if !\interleave
    stvx v0, 0,r3
    stvx v4,r3,r7
    stvx v3,r3,r6
    stvx v7,r3,r11
.endif
    BF v1,v5,v1,v13
    BF v2,v6,v2,v9
.if !\interleave
    stvx v1,r3,r9
    stvx v2,r3,r5
    stvx v5,r3,r8
    stvx v6,r3,r10
.else
    vmrghw v8,v0,v1
    vmrglw v9,v0,v1
    stvx v8, 0,r3
    stvx v9,r3,r9
    vmrghw v8,v2,v3
    vmrglw v9,v2,v3
    stvx v8,r3,r5
    stvx v9,r3,r6
    vmrghw v8,v4,v5
    vmrglw v9,v4,v5
    stvx v8,r3,r7
    stvx v9,r3,r8
    vmrghw v8,v6,v7
    vmrglw v9,v6,v7
    stvx v8,r3,r10
    stvx v9,r3,r11
.endif
    addi r3,r3,32
    addi r4,r4,16
    bdnz 1b
    sub r3,r3,r5
    blr
.endm

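// Split-radix recursion: fft_N runs fft_{N/2} on the first half of z and
// fft_{N/4} on each quarter of the second half, then tail-branches into one
// fft_pass over the whole block. LR is stashed at a per-size slot on the
// stack (\bits-5 pointers above r1) so the nested bl calls at different
// recursion depths do not overwrite each other's saved return address.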
.macro DECL_FFT suffix, bits, n, n2, n4
fft\n\suffix\()_altivec:
    mflr r0
    STORE_PTR r0,r1,\bits-5
    bl fft\n2\()_altivec
    addi2 r3,\n*4
    bl fft\n4\()_altivec
    addi2 r3,\n*2
    bl fft\n4\()_altivec
    addi2 r3,\n*-6
    LOAD_PTR r0,r1,\bits-5
    LOAD_PTR r4,r12,\bits
    mtlr r0
    li r5,\n/16
    b fft_pass\suffix\()_altivec
.endm

.macro DECL_FFTS interleave, suffix
.text
def_fft4 \suffix
def_fft8 \suffix
def_fft16 \suffix
PASS \interleave, \suffix
DECL_FFT \suffix, 5, 32, 16, 8
DECL_FFT \suffix, 6, 64, 32, 16
DECL_FFT \suffix, 7, 128, 64, 32
DECL_FFT \suffix, 8, 256, 128, 64
DECL_FFT \suffix, 9, 512, 256, 128
DECL_FFT \suffix,10, 1024, 512, 256
DECL_FFT \suffix,11, 2048, 1024, 512
DECL_FFT \suffix,12, 4096, 2048, 1024
DECL_FFT \suffix,13, 8192, 4096, 2048
DECL_FFT \suffix,14,16384, 8192, 4096
DECL_FFT \suffix,15,32768,16384, 8192
DECL_FFT \suffix,16,65536,32768,16384

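// Dispatch table of entry points: entry i is the 2^(i+2)-point transform
// (entry 0 = fft4, entry 14 = fft65536).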
.rodata
.global EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec
EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec:
    PTR fft4\suffix\()_altivec
    PTR fft8\suffix\()_altivec
    PTR fft16\suffix\()_altivec
    PTR fft32\suffix\()_altivec
    PTR fft64\suffix\()_altivec
    PTR fft128\suffix\()_altivec
    PTR fft256\suffix\()_altivec
    PTR fft512\suffix\()_altivec
    PTR fft1024\suffix\()_altivec
    PTR fft2048\suffix\()_altivec
    PTR fft4096\suffix\()_altivec
    PTR fft8192\suffix\()_altivec
    PTR fft16384\suffix\()_altivec
    PTR fft32768\suffix\()_altivec
    PTR fft65536\suffix\()_altivec
.endm

DECL_FFTS 0
DECL_FFTS 1, _interleave