arm: Use .data.rel.ro for const data with relocations
[libav.git] / libavcodec / arm / fft_vfp.S
CommitLineData
8b9eba66
MS
1/*
2 * Copyright (c) 2013 RISC OS Open Ltd
3 * Author: Ben Avison <bavison@riscosopen.org>
4 *
5 * This file is part of Libav.
6 *
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "libavutil/arm/asm.S"
23
87552d54
BA
24@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
25@ VFP is in RunFast mode, vector length 4, stride 1 throughout, and
26@ all single-precision VFP registers may be corrupted on exit. The a2
27@ register may not be clobbered in these functions, as it holds the
28@ stored original FPSCR.
29
@ Dispatcher: a1 = context (first word is nbits), a2 = complex data.
@ Moves the data pointer into a1 and tail-jumps through fft_tab_vfp.
@ The table base is biased by -8 bytes (two entries) because the
@ smallest supported nbits is 2, which must select fft4_vfp (entry 0).
30function ff_fft_calc_vfp, export=1
31 ldr ip, [a1, #0] @ nbits
32 mov a1, a2
b280c620
MS
33 movrel a2, (fft_tab_vfp - 8)
34 ldr pc, [a2, ip, lsl #2] @ jump to fft_tab_vfp[nbits - 2]
35endfunc
@ Jump table of transform implementations, indexed by nbits - 2
@ (see the -8 byte bias applied in ff_fft_calc_vfp above).
@ relocate=1 places it in .data.rel.ro: the entries are absolute
@ pointers that need load-time relocation in PIC builds.
f963f803 36const fft_tab_vfp, relocate=1
87552d54
BA
37 .word fft4_vfp
38 .word fft8_vfp
39 .word X(ff_fft16_vfp) @ this one alone is exported
40 .word fft32_vfp
41 .word fft64_vfp
42 .word fft128_vfp
43 .word fft256_vfp
44 .word fft512_vfp
45 .word fft1024_vfp
46 .word fft2048_vfp
47 .word fft4096_vfp
48 .word fft8192_vfp
49 .word fft16384_vfp
50 .word fft32768_vfp
51 .word fft65536_vfp
b280c620 52endconst
8b9eba66
MS
53
@ In-place 4-point FFT. a1 = z, an array of 4 complex single-
@ precision pairs (re, im). Called via the dispatch table with the
@ standard AAPCS (no vector mode set up); uses only s0-s15, so no
@ callee-saved VFP registers need preserving. The "@ stall" lines
@ mark VFP result-latency gaps deliberately left in the schedule.
54function fft4_vfp
55 vldr d0, [a1, #0*2*4] @ s0,s1 = z[0]
56 vldr d4, [a1, #1*2*4] @ s8,s9 = z[1]
57 vldr d1, [a1, #2*2*4] @ s2,s3 = z[2]
58 vldr d5, [a1, #3*2*4] @ s10,s11 = z[3]
59 @ stall
60 vadd.f s12, s0, s8 @ i0
61 vadd.f s13, s1, s9 @ i1
62 vadd.f s14, s2, s10 @ i2
63 vadd.f s15, s3, s11 @ i3
64 vsub.f s8, s0, s8 @ i4
65 vsub.f s9, s1, s9 @ i5
66 vsub.f s10, s2, s10 @ i6
67 vsub.f s11, s3, s11 @ i7
68 @ stall
69 @ stall
70 vadd.f s0, s12, s14 @ z[0].re
71 vsub.f s4, s12, s14 @ z[2].re
72 vadd.f s1, s13, s15 @ z[0].im
73 vsub.f s5, s13, s15 @ z[2].im
74 vadd.f s7, s9, s10 @ z[3].im
75 vsub.f s3, s9, s10 @ z[1].im
76 vadd.f s2, s8, s11 @ z[1].re
77 vsub.f s6, s8, s11 @ z[3].re
78 @ stall
79 @ stall
80 vstr d0, [a1, #0*2*4]
81 vstr d2, [a1, #2*2*4]
82 @ stall
83 @ stall
84 vstr d1, [a1, #1*2*4]
85 vstr d3, [a1, #3*2*4]
86
87 bx lr
88endfunc
89
@ First stage of the in-place 8-point FFT on z (a1), run under VFP
@ vector mode (length 4). Performs the 4-point FFT on z[0..3], the
@ butterflies against z[4..7] (with the cos1pi4 twiddle), and stores
@ every result except z[1]/z[3], which are left live in d8/d9
@ (s16-s19) for macro_fft8_tail — the 16-point code interleaves
@ independent loads between head and tail to hide latency.
@ Spills through [a1, ...] are used to move values between register
@ banks, since VFP vector ops operate on fixed register ranges.
90.macro macro_fft8_head
91 @ FFT4
92 vldr d4, [a1, #0 * 2*4]
93 vldr d6, [a1, #1 * 2*4]
94 vldr d5, [a1, #2 * 2*4]
95 vldr d7, [a1, #3 * 2*4]
96 @ BF
97 vldr d12, [a1, #4 * 2*4]
98 vadd.f s16, s8, s12 @ vector op
99 vldr d14, [a1, #5 * 2*4]
100 vldr d13, [a1, #6 * 2*4]
101 vldr d15, [a1, #7 * 2*4]
102 vsub.f s20, s8, s12 @ vector op
103 vadd.f s0, s16, s18
104 vsub.f s2, s16, s18
105 vadd.f s1, s17, s19
106 vsub.f s3, s17, s19
107 vadd.f s7, s21, s22
108 vsub.f s5, s21, s22
109 vadd.f s4, s20, s23
110 vsub.f s6, s20, s23
111 vsub.f s20, s24, s28 @ vector op
112 vstr d0, [a1, #0 * 2*4] @ transfer s0-s7 to s24-s31 via memory
113 vstr d1, [a1, #1 * 2*4]
114 vldr s0, cos1pi4
115 vadd.f s16, s24, s28 @ vector op
116 vstr d2, [a1, #2 * 2*4]
117 vstr d3, [a1, #3 * 2*4]
118 vldr d12, [a1, #0 * 2*4]
119 @ TRANSFORM
120 vmul.f s20, s20, s0 @ vector x scalar op
121 vldr d13, [a1, #1 * 2*4]
122 vldr d14, [a1, #2 * 2*4]
123 vldr d15, [a1, #3 * 2*4]
124 @ BUTTERFLIES
125 vadd.f s0, s18, s16
126 vadd.f s1, s17, s19
127 vsub.f s2, s17, s19
128 vsub.f s3, s18, s16
129 vadd.f s4, s21, s20
130 vsub.f s5, s21, s20
131 vadd.f s6, s22, s23
132 vsub.f s7, s22, s23
133 vadd.f s8, s0, s24 @ vector op
134 vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory
135 vstr d1, [a1, #1 * 2*4]
136 vldr d6, [a1, #0 * 2*4]
137 vldr d7, [a1, #1 * 2*4]
138 vadd.f s1, s5, s6
139 vadd.f s0, s7, s4
140 vsub.f s2, s5, s6
141 vsub.f s3, s7, s4
142 vsub.f s12, s24, s12 @ vector op
143 vsub.f s5, s29, s1
144 vsub.f s4, s28, s0
145 vsub.f s6, s30, s2
146 vsub.f s7, s31, s3
147 vadd.f s16, s0, s28 @ vector op
148 vstr d6, [a1, #4 * 2*4]
149 vstr d7, [a1, #6 * 2*4]
150 vstr d4, [a1, #0 * 2*4]
151 vstr d5, [a1, #2 * 2*4]
152 vstr d2, [a1, #5 * 2*4]
153 vstr d3, [a1, #7 * 2*4]
154.endm
155
@ Completes macro_fft8_head: stores the z[1]/z[3] results that the
@ head left live in d8/d9 (s16-s19).
156.macro macro_fft8_tail
157 vstr d8, [a1, #1 * 2*4]
158 vstr d9, [a1, #3 * 2*4]
159.endm
160
87552d54
BA
@ Internal 8-point FFT body, using the modified AAPCS described at
@ the top of the file (VFP vector mode already active; a2 holds the
@ caller's FPSCR and must not be clobbered). a1 = z, in place.
161function .Lfft8_internal_vfp
162 macro_fft8_head
163 macro_fft8_tail
164 bx lr
165endfunc
166
8b9eba66
MS
@ Public 8-point entry: switches VFP into RunFast mode with vector
@ length 4 / stride 1 (FPSCR = 0x03030000), preserves the callee-
@ saved VFP registers s16-s31, runs the internal routine, then
@ restores the caller's FPSCR (kept in a2 throughout, per the
@ modified-AAPCS note above) and returns via ip (saved lr).
167function fft8_vfp
168 ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
169 fmrx a2, FPSCR
170 fmxr FPSCR, a3
171 vpush {s16-s31}
87552d54
BA
172 mov ip, lr
173 bl .Lfft8_internal_vfp
8b9eba66
MS
174 vpop {s16-s31}
175 fmxr FPSCR, a2
87552d54 176 bx ip
8b9eba66
MS
177endfunc
178
@ Literal pool of twiddle-factor cosines used by the FFT kernels.
@ 8-byte aligned (.align 3) so adjacent pairs can be fetched with a
@ single d-register load (e.g. "vldr d1, cos1pi4" picks up cos1pi4
@ and cos1pi8 together).
179.align 3
180cos1pi4: @ cos(1*pi/4) = sqrt(2)/2
181 .float 0.707106769084930419921875
182cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
183 .float 0.92387950420379638671875
184cos3pi8: @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
185 .float 0.3826834261417388916015625
186
@ Internal 16-point FFT body (modified AAPCS: VFP vector mode active,
@ a2 preserved). a1 = z, transformed in place. Built from the 8-point
@ FFT on z[0..7] (head/tail macros, with independent loads interleaved
@ between them) plus 4-point FFTs on z[8..11] and z[12..15], combined
@ with the cos1pi4/cos1pi8/cos3pi8 twiddles. Instruction order is
@ hand-scheduled to hide VFP result latency — do not reorder.
87552d54 187function .Lfft16_internal_vfp
8b9eba66
MS
188 macro_fft8_head
189 @ FFT4(z+8)
190 vldr d10, [a1, #8 * 2*4]
191 vldr d12, [a1, #9 * 2*4]
192 vldr d11, [a1, #10 * 2*4]
193 vldr d13, [a1, #11 * 2*4]
194 macro_fft8_tail
195 vadd.f s16, s20, s24 @ vector op
196 @ FFT4(z+12)
197 vldr d4, [a1, #12 * 2*4]
198 vldr d6, [a1, #13 * 2*4]
199 vldr d5, [a1, #14 * 2*4]
200 vsub.f s20, s20, s24 @ vector op
201 vldr d7, [a1, #15 * 2*4]
202 vadd.f s0, s16, s18
203 vsub.f s4, s16, s18
204 vadd.f s1, s17, s19
205 vsub.f s5, s17, s19
206 vadd.f s7, s21, s22
207 vsub.f s3, s21, s22
208 vadd.f s2, s20, s23
209 vsub.f s6, s20, s23
210 vadd.f s16, s8, s12 @ vector op
211 vstr d0, [a1, #8 * 2*4]
212 vstr d2, [a1, #10 * 2*4]
213 vstr d1, [a1, #9 * 2*4]
214 vsub.f s20, s8, s12
215 vstr d3, [a1, #11 * 2*4]
216 @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
217 vldr d12, [a1, #10 * 2*4]
218 vadd.f s0, s16, s18
219 vadd.f s1, s17, s19
220 vsub.f s6, s16, s18
221 vsub.f s7, s17, s19
222 vsub.f s3, s21, s22
223 vadd.f s2, s20, s23
224 vadd.f s5, s21, s22
225 vsub.f s4, s20, s23
226 vstr d0, [a1, #12 * 2*4]
227 vmov s0, s6
228 @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
229 vldr d6, [a1, #9 * 2*4]
230 vstr d1, [a1, #13 * 2*4]
231 vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8
232 vstr d2, [a1, #15 * 2*4]
233 vldr d7, [a1, #13 * 2*4]
234 vadd.f s4, s25, s24
235 vsub.f s5, s25, s24
236 vsub.f s6, s0, s7
237 vadd.f s7, s0, s7
238 vmul.f s20, s12, s3 @ vector op
239 @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
240 vldr d4, [a1, #11 * 2*4]
241 vldr d5, [a1, #15 * 2*4]
242 vldr s1, cos3pi8
243 vmul.f s24, s4, s2 @ vector * scalar op
244 vmul.f s28, s12, s1 @ vector * scalar op
245 vmul.f s12, s8, s1 @ vector * scalar op
246 vadd.f s4, s20, s29
247 vsub.f s5, s21, s28
248 vsub.f s6, s22, s31
249 vadd.f s7, s23, s30
250 vmul.f s8, s8, s3 @ vector * scalar op
251 vldr d8, [a1, #1 * 2*4]
252 vldr d9, [a1, #5 * 2*4]
253 vldr d10, [a1, #3 * 2*4]
254 vldr d11, [a1, #7 * 2*4]
255 vldr d14, [a1, #2 * 2*4]
256 vadd.f s0, s6, s4
257 vadd.f s1, s5, s7
258 vsub.f s2, s5, s7
259 vsub.f s3, s6, s4
260 vadd.f s4, s12, s9
261 vsub.f s5, s13, s8
262 vsub.f s6, s14, s11
263 vadd.f s7, s15, s10
264 vadd.f s12, s0, s16 @ vector op
265 vstr d0, [a1, #1 * 2*4]
266 vstr d1, [a1, #5 * 2*4]
267 vldr d4, [a1, #1 * 2*4]
268 vldr d5, [a1, #5 * 2*4]
269 vadd.f s0, s6, s4
270 vadd.f s1, s5, s7
271 vsub.f s2, s5, s7
272 vsub.f s3, s6, s4
273 vsub.f s8, s16, s8 @ vector op
274 vstr d6, [a1, #1 * 2*4]
275 vstr d7, [a1, #5 * 2*4]
276 vldr d15, [a1, #6 * 2*4]
277 vsub.f s4, s20, s0
278 vsub.f s5, s21, s1
279 vsub.f s6, s22, s2
280 vsub.f s7, s23, s3
281 vadd.f s20, s0, s20 @ vector op
282 vstr d4, [a1, #9 * 2*4]
283 @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
284 vldr d6, [a1, #8 * 2*4]
285 vstr d5, [a1, #13 * 2*4]
286 vldr d7, [a1, #12 * 2*4]
287 vstr d2, [a1, #11 * 2*4]
288 vldr d8, [a1, #0 * 2*4]
289 vstr d3, [a1, #15 * 2*4]
290 vldr d9, [a1, #4 * 2*4]
291 vadd.f s0, s26, s24
292 vadd.f s1, s25, s27
293 vsub.f s2, s25, s27
294 vsub.f s3, s26, s24
295 vadd.f s4, s14, s12
296 vadd.f s5, s13, s15
297 vsub.f s6, s13, s15
298 vsub.f s7, s14, s12
299 vadd.f s8, s0, s28 @ vector op
300 vstr d0, [a1, #3 * 2*4]
301 vstr d1, [a1, #7 * 2*4]
302 vldr d6, [a1, #3 * 2*4]
303 vldr d7, [a1, #7 * 2*4]
304 vsub.f s0, s16, s4
305 vsub.f s1, s17, s5
306 vsub.f s2, s18, s6
307 vsub.f s3, s19, s7
308 vsub.f s12, s28, s12 @ vector op
309 vadd.f s16, s4, s16 @ vector op
310 vstr d10, [a1, #3 * 2*4]
311 vstr d11, [a1, #7 * 2*4]
312 vstr d4, [a1, #2 * 2*4]
313 vstr d5, [a1, #6 * 2*4]
314 vstr d0, [a1, #8 * 2*4]
315 vstr d1, [a1, #12 * 2*4]
316 vstr d6, [a1, #10 * 2*4]
317 vstr d7, [a1, #14 * 2*4]
318 vstr d8, [a1, #0 * 2*4]
319 vstr d9, [a1, #4 * 2*4]
320
87552d54
BA
321 bx lr
322endfunc
323
@ Public, exported 16-point entry (referenced directly from
@ fft_tab_vfp via X()). Same wrapper pattern as fft8_vfp: enter
@ RunFast / vector-length-4 mode, save s16-s31, run the internal
@ routine, restore the caller's FPSCR from a2, return via ip.
324function ff_fft16_vfp, export=1
325 ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
326 fmrx a2, FPSCR
327 fmxr FPSCR, a3
328 vpush {s16-s31}
329 mov ip, lr
330 bl .Lfft16_internal_vfp
8b9eba66
MS
331 vpop {s16-s31}
332 fmxr FPSCR, a2
87552d54
BA
333 bx ip
334endfunc
335
@ pass n, z0, z1, z2, z3
@ Radix-4 combination pass, two complex elements per iteration.
@ v5 = twiddle table pointer: wre values are read forwards with
@ "vldmia v5!", wim values backwards with "vldmdb v6!" from
@ v6 = v5 + 4*2*n. z1/z2/z3 address the other three quarter-blocks
@ at byte offsets 8*o1 / 8*o2 / 8*o3 from their base registers
@ (o1..o3 are .set by def_fft; for smaller transforms several of
@ z0..z3 alias the same register, hence the conditional pointer
@ increments guarded on \n*4*2 below). a4 counts n-1 loop
@ iterations; the tail after "bne 1b" repeats the first two group
@ computations without advancing the pointers. Scheduling note:
@ the "@ stall" comments document VFP issue/latency gaps.
336.macro pass n, z0, z1, z2, z3
337 add v6, v5, #4*2*\n
338 @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3])
339 @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
340 @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0])
341 @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
342 vldr d8, [\z2, #8*(o2+1)] @ s16,s17
343 vldmdb v6!, {s2}
344 vldr d9, [\z3, #8*(o3+1)] @ s18,s19
345 vldmia v5!, {s0,s1} @ s0 is unused
346 vldr s7, [\z2, #8*o2] @ t1
347 vmul.f s20, s16, s2 @ vector * scalar
348 vldr s0, [\z3, #8*o3] @ t5
349 vldr s6, [\z2, #8*o2+4] @ t2
350 vldr s3, [\z3, #8*o3+4] @ t6
351 vmul.f s16, s16, s1 @ vector * scalar
352 ldr a4, =\n-1
3531: add \z0, \z0, #8*2
354 .if \n*4*2 >= 512
355 add \z1, \z1, #8*2
356 .endif
357 .if \n*4*2 >= 256
358 add \z2, \z2, #8*2
359 .endif
360 .if \n*4*2 >= 512
361 add \z3, \z3, #8*2
362 .endif
363 @ up to 2 stalls (VFP vector issuing / waiting for s0)
364 @ depending upon whether this is the first iteration and
365 @ how many add instructions are inserted above
366 vadd.f s4, s0, s7 @ t5
367 vadd.f s5, s6, s3 @ t6
368 vsub.f s6, s6, s3 @ t4
369 vsub.f s7, s0, s7 @ t3
370 vldr d6, [\z0, #8*0-8*2] @ s12,s13
371 vadd.f s0, s16, s21 @ t1
372 vldr d7, [\z1, #8*o1-8*2] @ s14,s15
373 vsub.f s1, s18, s23 @ t5
374 vadd.f s8, s4, s12 @ vector + vector
375 @ stall (VFP vector issuing)
376 @ stall (VFP vector issuing)
377 @ stall (VFP vector issuing)
378 vsub.f s4, s12, s4
379 vsub.f s5, s13, s5
380 vsub.f s6, s14, s6
381 vsub.f s7, s15, s7
382 vsub.f s2, s17, s20 @ t2
383 vadd.f s3, s19, s22 @ t6
384 vstr d4, [\z0, #8*0-8*2] @ s8,s9
385 vstr d5, [\z1, #8*o1-8*2] @ s10,s11
386 @ stall (waiting for s5)
387 vstr d2, [\z2, #8*o2-8*2] @ s4,s5
388 vadd.f s4, s1, s0 @ t5
389 vstr d3, [\z3, #8*o3-8*2] @ s6,s7
390 vsub.f s7, s1, s0 @ t3
391 vadd.f s5, s2, s3 @ t6
392 vsub.f s6, s2, s3 @ t4
393 vldr d6, [\z0, #8*1-8*2] @ s12,s13
394 vldr d7, [\z1, #8*(o1+1)-8*2] @ s14,s15
395 vldr d4, [\z2, #8*o2] @ s8,s9
396 vldmdb v6!, {s2,s3}
397 vldr d5, [\z3, #8*o3] @ s10,s11
398 vadd.f s20, s4, s12 @ vector + vector
399 vldmia v5!, {s0,s1}
400 vldr d8, [\z2, #8*(o2+1)] @ s16,s17
401 @ stall (VFP vector issuing)
402 vsub.f s4, s12, s4
403 vsub.f s5, s13, s5
404 vsub.f s6, s14, s6
405 vsub.f s7, s15, s7
406 vmul.f s12, s8, s3 @ vector * scalar
407 vstr d10, [\z0, #8*1-8*2] @ s20,s21
408 vldr d9, [\z3, #8*(o3+1)] @ s18,s19
409 vstr d11, [\z1, #8*(o1+1)-8*2] @ s22,s23
410 vmul.f s8, s8, s0 @ vector * scalar
411 vstr d2, [\z2, #8*(o2+1)-8*2] @ s4,s5
412 @ stall (waiting for s7)
413 vstr d3, [\z3, #8*(o3+1)-8*2] @ s6,s7
414 vmul.f s20, s16, s2 @ vector * scalar
415 @ stall (VFP vector issuing)
416 @ stall (VFP vector issuing)
417 @ stall (VFP vector issuing)
418 vadd.f s7, s8, s13 @ t1
419 vsub.f s6, s9, s12 @ t2
420 vsub.f s0, s10, s15 @ t5
421 vadd.f s3, s11, s14 @ t6
422 vmul.f s16, s16, s1 @ vector * scalar
423 subs a4, a4, #1
424 bne 1b
425 @ What remains is identical to the first two indentations of
426 @ the above, but without the increment of z
427 vadd.f s4, s0, s7 @ t5
428 vadd.f s5, s6, s3 @ t6
429 vsub.f s6, s6, s3 @ t4
430 vsub.f s7, s0, s7 @ t3
431 vldr d6, [\z0, #8*0] @ s12,s13
432 vadd.f s0, s16, s21 @ t1
433 vldr d7, [\z1, #8*o1] @ s14,s15
434 vsub.f s1, s18, s23 @ t5
435 vadd.f s8, s4, s12 @ vector + vector
436 vsub.f s4, s12, s4
437 vsub.f s5, s13, s5
438 vsub.f s6, s14, s6
439 vsub.f s7, s15, s7
440 vsub.f s2, s17, s20 @ t2
441 vadd.f s3, s19, s22 @ t6
442 vstr d4, [\z0, #8*0] @ s8,s9
443 vstr d5, [\z1, #8*o1] @ s10,s11
444 vstr d2, [\z2, #8*o2] @ s4,s5
445 vadd.f s4, s1, s0 @ t5
446 vstr d3, [\z3, #8*o3] @ s6,s7
447 vsub.f s7, s1, s0 @ t3
448 vadd.f s5, s2, s3 @ t6
449 vsub.f s6, s2, s3 @ t4
450 vldr d6, [\z0, #8*1] @ s12,s13
451 vldr d7, [\z1, #8*(o1+1)] @ s14,s15
452 vadd.f s20, s4, s12 @ vector + vector
453 vsub.f s4, s12, s4
454 vsub.f s5, s13, s5
455 vsub.f s6, s14, s6
456 vsub.f s7, s15, s7
457 vstr d10, [\z0, #8*1] @ s20,s21
458 vstr d11, [\z1, #8*(o1+1)] @ s22,s23
459 vstr d2, [\z2, #8*(o2+1)] @ s4,s5
460 vstr d3, [\z3, #8*(o3+1)] @ s6,s7
461.endm
462
@ def_fft n, n2, n4
@ Emits two functions for transform size n:
@ .Lfft<n>_internal_vfp — recursive decomposition: fft(n2) on the
@ first half, fft(n4) on each remaining quarter, then one "pass"
@ over the ff_cos_<n> twiddle table (v5). o1/o2/o3 and the number
@ of distinct z pointers passed to "pass" depend on n because
@ smaller transforms reuse one base register (see pass above).
@ fft<n>_vfp — public wrapper entering RunFast / vector-length-4
@ mode (FPSCR = 0x03030000), saving s16-s31, and restoring the
@ caller's FPSCR from a2 on exit. .ltorg flushes each instance's
@ literal pool so the ldr-immediate stays in range.
463.macro def_fft n, n2, n4
464function .Lfft\n\()_internal_vfp
465 .if \n >= 512
466 push {v1-v6,lr}
467 .elseif \n >= 256
468 push {v1-v2,v5-v6,lr}
469 .else
470 push {v1,v5-v6,lr}
471 .endif
472 mov v1, a1
473 bl .Lfft\n2\()_internal_vfp
474 add a1, v1, #8*(\n/4)*2
475 bl .Lfft\n4\()_internal_vfp
476 movrelx v5, X(ff_cos_\n), a1
477 add a1, v1, #8*(\n/4)*3
478 bl .Lfft\n4\()_internal_vfp
479 .if \n >= 512
480 .set o1, 0*(\n/4/2)
481 .set o2, 0*(\n/4/2)
482 .set o3, 0*(\n/4/2)
483 add v2, v1, #8*2*(\n/4/2)
484 add v3, v1, #8*4*(\n/4/2)
485 add v4, v1, #8*6*(\n/4/2)
486 pass (\n/4/2), v1, v2, v3, v4
487 pop {v1-v6,pc}
488 .elseif \n >= 256
489 .set o1, 2*(\n/4/2)
490 .set o2, 0*(\n/4/2)
491 .set o3, 2*(\n/4/2)
492 add v2, v1, #8*4*(\n/4/2)
493 pass (\n/4/2), v1, v1, v2, v2
494 pop {v1-v2,v5-v6,pc}
495 .else
496 .set o1, 2*(\n/4/2)
497 .set o2, 4*(\n/4/2)
498 .set o3, 6*(\n/4/2)
499 pass (\n/4/2), v1, v1, v1, v1
500 pop {v1,v5-v6,pc}
501 .endif
8b9eba66 502endfunc
87552d54
BA
503
504function fft\n\()_vfp
505 ldr a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */
506 fmrx a2, FPSCR
507 fmxr FPSCR, a3
508 vpush {s16-s31}
509 mov ip, lr
510 bl .Lfft\n\()_internal_vfp
511 vpop {s16-s31}
512 fmxr FPSCR, a2
513 bx ip
514endfunc
515
516.ltorg
517.endm
518
@ Instantiate every power-of-two size reachable from fft_tab_vfp
@ (32 .. 65536); sizes 4, 8 and 16 are hand-written above.
519 def_fft 32, 16, 8
520 def_fft 64, 32, 16
521 def_fft 128, 64, 32
522 def_fft 256, 128, 64
523 def_fft 512, 256, 128
524 def_fft 1024, 512, 256
525 def_fft 2048, 1024, 512
526 def_fft 4096, 2048, 1024
527 def_fft 8192, 4096, 2048
528 def_fft 16384, 8192, 4096
529 def_fft 32768, 16384, 8192
530 def_fft 65536, 32768, 16384