PPC: add some asm support macros
[libav.git] / libavcodec / ppc / fft_altivec_s.S
/*
 * FFT transform with Altivec optimizations
 * Copyright (c) 2009 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * These functions are not individually interchangeable with the C versions.
 * While C takes arrays of FFTComplex, Altivec leaves intermediate results
 * in blocks as convenient to the vector size,
 * i.e. {4x real, 4x imaginary, 4x real, ...}.
 *
 * I ignore the standard calling convention.
 * Instead, the following registers are treated as global constants:
 * v14: zero
 * v15..v18: cosines
 * v19..v29: permutations
 * r9: 16
 * r12: ff_cos_tabs
 * and the rest are free for local use.
 */
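
/*
 * For example, an 8-point transform that the C code would keep as
 *     {r0,i0, r1,i1, r2,i2, r3,i3, r4,i4, r5,i5, r6,i6, r7,i7}
 * is held here as four homogeneous vectors (see def_fft8 below):
 *     {r0,r1,r2,r3} {i0,i1,i2,i3} {r4,r5,r6,r7} {i4,i5,i6,i7}
 */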

#include "config.h"
#include "asm.S"

.text

/* Apple gas doesn't support this shorthand */
.macro mtctr rx
    mtspr 9, \rx
.endm

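// addi2 splits a 32-bit immediate into a low addi and a high addis:
// @l is the low 16 bits (sign-extended by addi) and @ha is the high 16 bits
// corrected for that sign extension, so the two adds sum to \imm.
// E.g. for \imm = 0x18000, addi adds @l = -0x8000 and addis adds @ha<<16 = 0x20000.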
.macro addi2 ra, imm // add 32-bit immediate
.if \imm & 0xffff
    addi \ra, \ra, \imm@l
.endif
.if (\imm+0x8000)>>16
    addis \ra, \ra, \imm@ha
.endif
.endm

#if ARCH_PPC64
#define PTR .quad
.macro LOAD_PTR ra, rbase, offset
    ld  \ra,(\offset)*8(\rbase)
.endm
.macro STORE_PTR ra, rbase, offset
    std \ra,(\offset)*8(\rbase)
.endm
#else
#define PTR .int
.macro LOAD_PTR ra, rbase, offset
    lwz \ra,(\offset)*4(\rbase)
.endm
.macro STORE_PTR ra, rbase, offset
    stw \ra,(\offset)*4(\rbase)
.endm
#endif

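// 4-point FFT. Inputs \a0 = {r0,i0,r1,i1}, \a1 = {r2,i2,r3,i3} are interleaved;
// outputs \a2 = {r0,r1,r2,r3}, \a3 = {i0,i1,i2,i3} are planar.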
.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3
    vperm  \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm  \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vaddfp \a0,\a2,\a3                         // {t1,t2,t6,t5}
    vsubfp \a1,\a2,\a3                         // {t3,t4,t8,t7}
    vmrghw \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm  \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vaddfp \a0,\a2,\a3                         // {r0,r1,i0,i1}
    vsubfp \a1,\a2,\a3                         // {r2,r3,i2,i3}
    vperm  \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm  \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
.endm

.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3
    vperm  \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm  \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vperm  \b2,\b0,\b1,v20
    vperm  \b3,\b0,\b1,v21
    vaddfp \a0,\a2,\a3                         // {t1,t2,t6,t5}
    vsubfp \a1,\a2,\a3                         // {t3,t4,t8,t7}
    vaddfp \b0,\b2,\b3
    vsubfp \b1,\b2,\b3
    vmrghw \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm  \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vmrghw \b2,\b0,\b1
    vperm  \b3,\b0,\b1,v22
    vaddfp \a0,\a2,\a3                         // {r0,r1,i0,i1}
    vsubfp \a1,\a2,\a3                         // {r2,r3,i2,i3}
    vaddfp \b0,\b2,\b3
    vsubfp \b1,\b2,\b3
    vperm  \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm  \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
    vperm  \b2,\b0,\b1,v23
    vperm  \b3,\b0,\b1,v24
.endm

.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1
    vmrghw  \b2,\b0,\b1      // vcprm(0,s0,1,s1) // {r4,r6,i4,i6}
    vmrglw  \b3,\b0,\b1      // vcprm(2,s2,3,s3) // {r5,r7,i5,i7}
    vperm   \a2,\a0,\a1,v20  // FFT4 ...
    vperm   \a3,\a0,\a1,v21
    vaddfp  \b0,\b2,\b3                          // {t1,t3,t2,t4}
    vsubfp  \b1,\b2,\b3                          // {r5,r7,i5,i7}
    vperm   \b4,\b1,\b1,v25  // vcprm(2,3,0,1)   // {i5,i7,r5,r7}
    vaddfp  \a0,\a2,\a3
    vsubfp  \a1,\a2,\a3
    vmaddfp \b1,\b1,v17,v14  // * {-1,1,1,-1}/sqrt(2)
    vmaddfp \b1,\b4,v18,\b1  // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9}
    vmrghw  \a2,\a0,\a1
    vperm   \a3,\a0,\a1,v22
    vperm   \b2,\b0,\b1,v26  // vcprm(1,2,s3,s0) // {t3,t2,t9,t8}
    vperm   \b3,\b0,\b1,v27  // vcprm(0,3,s2,s1) // {t1,t4,t7,ta}
    vaddfp  \a0,\a2,\a3
    vsubfp  \a1,\a2,\a3
    vaddfp  \b0,\b2,\b3                          // {t1,t2,t9,ta}
    vsubfp  \b1,\b2,\b3                          // {t6,t5,tc,tb}
    vperm   \a2,\a0,\a1,v23
    vperm   \a3,\a0,\a1,v24
    vperm   \b2,\b0,\b1,v28  // vcprm(0,2,s1,s3) // {t1,t9,t5,tb}
    vperm   \b3,\b0,\b1,v29  // vcprm(1,3,s0,s2) // {t2,ta,t6,tc}
    vsubfp  \b0,\a2,\b2                          // {r4,r5,r6,r7}
    vsubfp  \b1,\a3,\b3                          // {i4,i5,i6,i7}
    vaddfp  \a0,\a2,\b2                          // {r0,r1,r2,r3}
    vaddfp  \a1,\a3,\b3                          // {i0,i1,i2,i3}
.endm

.macro BF d0,d1,s0,s1 // butterfly: d0 = s0+s1, d1 = s0-s1
    vsubfp \d1,\s0,\s1
    vaddfp \d0,\s0,\s1
.endm

.macro zip d0,d1,s0,s1 // interleave two planar vectors word by word
    vmrghw \d0,\s0,\s1
    vmrglw \d1,\s0,\s1
.endm

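// def_fft{4,8,16} emit the leaf transforms fft4_altivec, fft8_altivec and
// fft16_altivec; a non-blank \interleave suffix makes them zip the planar
// results back into {r,i} interleaved order before storing.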
.macro def_fft4 interleave
fft4\interleave\()_altivec:
    lvx  v0, 0,r3
    lvx  v1,r9,r3
    FFT4 v0,v1,v2,v3
.ifnb \interleave
    zip  v0,v1,v2,v3
    stvx v0, 0,r3
    stvx v1,r9,r3
.else
    stvx v2, 0,r3
    stvx v3,r9,r3
.endif
    blr
.endm

.macro def_fft8 interleave
fft8\interleave\()_altivec:
    addi r4,r3,32
    lvx  v0, 0,r3
    lvx  v1,r9,r3
    lvx  v2, 0,r4
    lvx  v3,r9,r4
    FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8
.ifnb \interleave
    zip  v4,v5,v0,v1
    zip  v6,v7,v2,v3
    stvx v4, 0,r3
    stvx v5,r9,r3
    stvx v6, 0,r4
    stvx v7,r9,r4
.else
    stvx v0, 0,r3
    stvx v1,r9,r3
    stvx v2, 0,r4
    stvx v3,r9,r4
.endif
    blr
.endm

.macro def_fft16 interleave
fft16\interleave\()_altivec:
    addi r5,r3,64
    addi r6,r3,96
    addi r4,r3,32
    lvx  v0, 0,r5
    lvx  v1,r9,r5
    lvx  v2, 0,r6
    lvx  v3,r9,r6
    FFT4x2 v0,v1,v2,v3,v4,v5,v6,v7
    lvx  v0, 0,r3
    lvx  v1,r9,r3
    lvx  v2, 0,r4
    lvx  v3,r9,r4
    FFT8 v0,v1,v2,v3,v8,v9,v10,v11,v12
    vmaddfp   v8,v4,v15,v14 // r2*wre
    vmaddfp   v9,v5,v15,v14 // i2*wre
    vmaddfp  v10,v6,v15,v14 // r3*wre
    vmaddfp  v11,v7,v15,v14 // i3*wre
    vmaddfp   v8,v5,v16,v8  // i2*wim
    vnmsubfp  v9,v4,v16,v9  // r2*wim
    vnmsubfp v10,v7,v16,v10 // i3*wim
    vmaddfp  v11,v6,v16,v11 // r3*wim
    BF   v10,v12,v10,v8
    BF   v11,v13,v9,v11
    BF   v0,v4,v0,v10
    BF   v3,v7,v3,v12
    BF   v1,v5,v1,v11
    BF   v2,v6,v2,v13
.ifnb \interleave
    zip   v8, v9,v0,v1
    zip  v10,v11,v2,v3
    zip  v12,v13,v4,v5
    zip  v14,v15,v6,v7
    stvx  v8, 0,r3
    stvx  v9,r9,r3
    stvx v10, 0,r4
    stvx v11,r9,r4
    stvx v12, 0,r5
    stvx v13,r9,r5
    stvx v14, 0,r6
    stvx v15,r9,r6
.else
    stvx v0, 0,r3
    stvx v4, 0,r5
    stvx v3,r9,r4
    stvx v7,r9,r6
    stvx v1,r9,r3
    stvx v5,r9,r5
    stvx v2, 0,r4
    stvx v6, 0,r6
.endif
    blr
.endm

// void pass(float *z, float *wre, int n)
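// r3 = z, r4 = wre, r5 = n (the first three integer argument registers)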
.macro PASS interleave, suffix
fft_pass\suffix\()_altivec:
    mtctr r5
    slwi  r0,r5,4
    slwi  r7,r5,6    // o2
    slwi  r5,r5,5    // o1
    add  r10,r5,r7   // o3
    add   r0,r4,r0   // wim
    addi  r6,r5,16   // o1+16
    addi  r8,r7,16   // o2+16
    addi r11,r10,16  // o3+16
1:
    lvx   v8, 0,r4   // wre
    lvx  v10, 0,r0   // wim
    sub   r0,r0,r9
    lvx   v9, 0,r0
    vperm v9,v9,v10,v19 // vcprm(s0,3,2,1) => wim[0 .. -3]
    lvx   v4,r3,r7   // r2 = z[o2]
    lvx   v5,r3,r8   // i2 = z[o2+16]
    lvx   v6,r3,r10  // r3 = z[o3]
    lvx   v7,r3,r11  // i3 = z[o3+16]
    vmaddfp  v10,v4,v8,v14 // r2*wre
    vmaddfp  v11,v5,v8,v14 // i2*wre
    vmaddfp  v12,v6,v8,v14 // r3*wre
    vmaddfp  v13,v7,v8,v14 // i3*wre
    lvx   v0, 0,r3   // r0 = z[0]
    lvx   v3,r3,r6   // i1 = z[o1+16]
    vmaddfp  v10,v5,v9,v10 // i2*wim
    vnmsubfp v11,v4,v9,v11 // r2*wim
    vnmsubfp v12,v7,v9,v12 // i3*wim
    vmaddfp  v13,v6,v9,v13 // r3*wim
    lvx   v1,r3,r9   // i0 = z[16]
    lvx   v2,r3,r5   // r1 = z[o1]
    BF    v12,v8,v12,v10
    BF    v13,v9,v11,v13
    BF    v0,v4,v0,v12
    BF    v3,v7,v3,v8
.if !\interleave
    stvx  v0, 0,r3
    stvx  v4,r3,r7
    stvx  v3,r3,r6
    stvx  v7,r3,r11
.endif
    BF    v1,v5,v1,v13
    BF    v2,v6,v2,v9
.if !\interleave
    stvx  v1,r3,r9
    stvx  v2,r3,r5
    stvx  v5,r3,r8
    stvx  v6,r3,r10
.else
    vmrghw v8,v0,v1
    vmrglw v9,v0,v1
    stvx  v8, 0,r3
    stvx  v9,r3,r9
    vmrghw v8,v2,v3
    vmrglw v9,v2,v3
    stvx  v8,r3,r5
    stvx  v9,r3,r6
    vmrghw v8,v4,v5
    vmrglw v9,v4,v5
    stvx  v8,r3,r7
    stvx  v9,r3,r8
    vmrghw v8,v6,v7
    vmrglw v9,v6,v7
    stvx  v8,r3,r10
    stvx  v9,r3,r11
.endif
    addi  r3,r3,32
    addi  r4,r4,16
    bdnz 1b
    sub   r3,r3,r5
    blr
.endm

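// fft\n: one fft\n2 over the first half of z, then fft\n4 over each quarter of
// the second half, then a tail call into fft_pass with the twiddle factors from
// ff_cos_tabs[\bits] to combine them (split-radix recursion).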
.macro DECL_FFT suffix, bits, n, n2, n4
fft\n\suffix\()_altivec:
    mflr  r0
    STORE_PTR r0,r1,\bits-5
    bl    fft\n2\()_altivec
    addi2 r3,\n*4
    bl    fft\n4\()_altivec
    addi2 r3,\n*2
    bl    fft\n4\()_altivec
    addi2 r3,\n*-6
    LOAD_PTR r0,r1,\bits-5
    LOAD_PTR r4,r12,\bits
    mtlr  r0
    li    r5,\n/16
    b     fft_pass\suffix\()_altivec
.endm

.macro DECL_FFTS interleave, suffix
    .text
    def_fft4  \suffix
    def_fft8  \suffix
    def_fft16 \suffix
    PASS \interleave, \suffix
    DECL_FFT \suffix, 5,   32,   16,    8
    DECL_FFT \suffix, 6,   64,   32,   16
    DECL_FFT \suffix, 7,  128,   64,   32
    DECL_FFT \suffix, 8,  256,  128,   64
    DECL_FFT \suffix, 9,  512,  256,  128
    DECL_FFT \suffix,10, 1024,  512,  256
    DECL_FFT \suffix,11, 2048, 1024,  512
    DECL_FFT \suffix,12, 4096, 2048, 1024
    DECL_FFT \suffix,13, 8192, 4096, 2048
    DECL_FFT \suffix,14,16384, 8192, 4096
    DECL_FFT \suffix,15,32768,16384, 8192
    DECL_FFT \suffix,16,65536,32768,16384

    .rodata
    .global EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec
EXTERN_ASM\()ff_fft_dispatch\suffix\()_altivec:
    PTR fft4\suffix\()_altivec
    PTR fft8\suffix\()_altivec
    PTR fft16\suffix\()_altivec
    PTR fft32\suffix\()_altivec
    PTR fft64\suffix\()_altivec
    PTR fft128\suffix\()_altivec
    PTR fft256\suffix\()_altivec
    PTR fft512\suffix\()_altivec
    PTR fft1024\suffix\()_altivec
    PTR fft2048\suffix\()_altivec
    PTR fft4096\suffix\()_altivec
    PTR fft8192\suffix\()_altivec
    PTR fft16384\suffix\()_altivec
    PTR fft32768\suffix\()_altivec
    PTR fft65536\suffix\()_altivec
.endm

DECL_FFTS 0
DECL_FFTS 1, _interleave