aarch64: Add assembly support for -fsanitize=hwaddress tagged globals.
[libav.git] / libavcodec / arm / fft_fixed_neon.S
CommitLineData
dba98529
MR
/*
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20
94d2b0d2 21#include "libavutil/arm/asm.S"
dba98529
MR
22
/*
 * bflies: butterfly stage shared by transform01 and transform2.
 *
 * All four arguments are register names substituted textually.
 *   d0, d1   in/out   data registers; both are overwritten with results
 *   r0, r1   scratch  contents are destroyed
 *
 * Every add/subtract is the halving form (vhadd/vhsub), so each result
 * is half of the plain sum or difference.  The tN names in the lane
 * comments are the labels used by the original author for the
 * intermediate values held in each 16-bit lane.
 */
.macro  bflies          d0,  d1,  r0,  r1
        vrev64.32       \r0, \d1                @ t5, t6, t1, t2
        vhsub.s16       \r1, \d1, \r0           @ t1-t5, t2-t6, t5-t1, t6-t2
        vhadd.s16       \r0, \d1, \r0           @ t1+t5, t2+t6, t5+t1, t6+t2
        vext.16         \r1, \r1, \r1, #1       @ t2-t6, t5-t1, t6-t2, t1-t5
        vtrn.32         \r0, \r1                @ t1+t5, t2+t6, t2-t6, t5-t1
                                                @ t5, t6, t4, t3
        vhsub.s16       \d1, \d0, \r0
        vhadd.s16       \d0, \d0, \r0
.endm
33
/*
 * transform01: scale one operand by a coefficient pair, then run bflies.
 *
 *   q0, q1   in/out   registers handed on to bflies
 *   d3       in/out   operand to multiply; every call site in this file
 *                     passes the high half of q1
 *   c0, c1   in       vectors of signed 16-bit coefficients
 *   r0       scratch  same width as d3
 *   w0, w1   scratch  twice the width of d3; w0 holds the 32-bit products,
 *                     then both are reused as the bflies scratch pair
 *
 * Per 16-bit lane:  d3 = (d3 * c0 + swapped(d3) * c1) >> 15
 * where swapped() exchanges the two 16-bit halves of each 32-bit word.
 * The narrowing shift truncates (vshrn, not vrshrn).
 */
.macro  transform01     q0,  q1,  d3,  c0,  c1,  r0,  w0,  w1
        vrev32.16       \r0, \d3                @ swap the halves of each 32-bit word
        vmull.s16       \w0, \d3, \c0           @ widening multiply, 32-bit products
        vmlal.s16       \w0, \r0, \c1           @ accumulate the swapped operand
        vshrn.s32       \d3, \w0, #15           @ back to 16 bits, truncating
        bflies          \q0, \q1, \w0, \w1
.endm
41
/*
 * transform2: scale two operands by two coefficient pairs, then run bflies.
 *
 *   d0, d2   accepted for call-site symmetry; never referenced in the body
 *   d1       in/out   first operand,  multiplied with (c0, c1)
 *   d3       in/out   second operand, multiplied with (c2, c3)
 *   q0, q1   in/out   registers handed on to bflies
 *   c0..c3   in       vectors of signed 16-bit coefficients
 *   r0, r1   scratch  same width as d1/d3
 *   w0, w1   scratch  twice the width of d1/d3; hold the 32-bit products,
 *                     then are reused as the bflies scratch pair
 *
 * Per 16-bit lane:
 *   d1 = (d1 * c0 + swapped(d1) * c1) >> 15
 *   d3 = (d3 * c2 + swapped(d3) * c3) >> 15
 * where swapped() exchanges the two 16-bit halves of each 32-bit word.
 * The narrowing shifts truncate (vshrn, not vrshrn).
 */
.macro  transform2      d0,  d1,  d2,  d3,  q0,  q1,  c0,  c1,  c2,  c3, \
                        r0,  r1,  w0,  w1
        vrev32.16       \r0, \d1
        vrev32.16       \r1, \d3
        vmull.s16       \w0, \d1, \c0
        vmlal.s16       \w0, \r0, \c1
        vmull.s16       \w1, \d3, \c2
        vmlal.s16       \w1, \r1, \c3
        vshrn.s32       \d1, \w0, #15
        vshrn.s32       \d3, \w1, #15
        bflies          \q0, \q1, \w0, \w1
.endm
54
/*
 * fft4: in-register 4-point transform step.
 *
 *   d0, d1   in/out   data registers; overwritten with the results
 *                     (d1 is also used briefly to hold a bit mask)
 *   r0, r1   scratch  contents are destroyed
 *
 * Works on D registers (fft4_neon, fft8) and on Q registers (fft16_neon),
 * since every instruction used here exists in both widths.  All adds and
 * subtracts are halving (vhadd/vhsub).  The tN / rN / iN names in the
 * lane comments are the labels used by the original author.
 */
.macro  fft4            d0,  d1,  r0,  r1
        vhsub.s16       \r0, \d0, \d1           @ t3, t4, t8, t7
        vhsub.s16       \r1, \d1, \d0
        vhadd.s16       \d0, \d0, \d1           @ t1, t2, t6, t5
        vmov.i64        \d1, #0xffff00000000    @ mask: bits 32-47 of each 64-bit lane
        vbit            \r0, \r1, \d1           @ under the mask take the reversed difference
        vrev64.16       \r1, \r0                @ t7, t8, t4, t3
        vtrn.32         \r0, \r1                @ t3, t4, t7, t8
        vtrn.32         \d0, \r0                @ t1, t2, t3, t4, t6, t5, t8, t7
        vhsub.s16       \d1, \d0, \r0           @ r2, i2, r3, i1
        vhadd.s16       \d0, \d0, \r0           @ r0, i0, r1, i3
.endm
67
/*
 * fft8: in-register 8-point transform step built from fft4 and transform01.
 *
 *   d0, d1   in/out   first half of the data, processed by fft4
 *   d2, d3   in/out   second half; replaced by their halved sum (d2) and
 *                     halved difference (d3) before transform01
 *   q0, q1   in/out   registers handed to transform01; the call sites in
 *                     this file pass the Q registers that contain d0..d3
 *   c0, c1   in       coefficient vectors for transform01
 *   r0, r1   scratch  D-width
 *   w0, w1   scratch  Q-width
 */
.macro  fft8            d0,  d1,  d2,  d3,  q0,  q1,  c0,  c1,  r0,  r1, w0, w1
        fft4            \d0, \d1, \r0, \r1
        vtrn.32         \d0, \d1                @ z0, z2, z1, z3
        vhadd.s16       \r0, \d2, \d3           @ t1, t2, t3, t4
        vhsub.s16       \d3, \d2, \d3           @ z5, z7
        vmov            \d2, \r0
        transform01     \q0, \q1, \d3, \c0, \c1, \r0, \w0, \w1
.endm
76
/*
 * fft4_neon: run the fft4 macro in place on 16 bytes of data.
 *
 *   r0   in   pointer to eight 16-bit values (presumably four interleaved
 *             re/im pairs -- confirm against the C caller)
 *
 * The load and store carry no alignment qualifier, unlike fft8_neon and
 * fft16_neon which use :128.  Uses d0-d3 only; r0 is left unchanged.
 */
function fft4_neon
        vld1.16         {d0-d1},  [r0]
        fft4            d0,  d1,  d2,  d3
        vst1.16         {d0-d1},  [r0]
        bx              lr
endfunc
83
/*
 * fft8_neon: run the fft8 macro in place on 32 bytes of data.
 *
 *   r0   in   pointer to sixteen 16-bit values; the :128 qualifier on the
 *             load and store requires 16-byte alignment
 *
 * Clobbers r1, d0-d3, d16-d21 (q8, q9, d20, d21 are macro scratch),
 * d30 and d31.  r0 is left unchanged.
 */
function fft8_neon
        vld1.16         {d0-d3},  [r0,:128]
        movrel          r1,  coefs
        vld1.16         {d30},    [r1,:64]      @ first row of coefs
        vdup.16         d31, d30[0]             @ its first entry in every lane
        fft8            d0,  d1,  d2,  d3,  q0,  q1,  d31, d30, d20, d21, q8, q9
        vtrn.32         d0,  d1
        vtrn.32         d2,  d3
        vst1.16         {d0-d3},  [r0,:128]
        bx              lr
endfunc
95
/*
 * fft16_neon: in-place transform of 64 bytes of data.
 *
 *   r0   in   pointer to thirty-two 16-bit values; the :128 qualifiers
 *             require 16-byte alignment
 *
 * The first half (q0, q1) goes through fft8, the second half (q2, q3)
 * through a Q-register fft4; the halves are then combined by transform01
 * and transform2 using the three coefficient rows in d28-d30.
 *
 * Clobbers r1, q0-q3, q8-q10 (d16-d21), d26-d31.  On return r0 points
 * 32 bytes past its entry value (post-increment on the first store).
 */
function fft16_neon
        vld1.16         {d0-d3},  [r0,:128]!
        vld1.16         {d4-d7},  [r0,:128]
        movrel          r1,  coefs
        sub             r0,  r0,  #32           @ undo the post-increment of the first load
        vld1.16         {d28-d31},[r1,:128]     @ all four rows of coefs
        vdup.16         d31, d28[0]             @ replace row 3 by a splat of coefs[0]
        fft8            d0,  d1,  d2,  d3,  q0,  q1,  d31, d28, d20, d21, q8, q9
        vswp            d5,  d6
        fft4            q2,  q3,  q8,  q9
        vswp            d5,  d6
        vtrn.32         q0,  q1                 @ z0, z4, z2, z6, z1, z5, z3, z7
        vtrn.32         q2,  q3                 @ z8, z12,z10,z14,z9, z13,z11,z15
        vswp            d1,  d2
        vdup.16         d31, d28[0]             @ same value as above; d31 was not written in between
        transform01     q0,  q2,  d5,  d31, d28, d20, q8, q9
        vdup.16         d26, d29[0]
        vdup.16         d27, d30[0]
        transform2      d2,  d6,  d3,  d7,  q1,  q3,  d26, d30, d27, d29, \
                        d20, d21, q8,  q9
        vtrn.32         q0,  q1
        vtrn.32         q2,  q3
        vst1.16         {d0-d3},  [r0,:128]!
        vst1.16         {d4-d7},  [r0,:128]
        bx              lr
endfunc
122
/*
 * fft_pass_neon: combining pass that the def_fft functions tail-branch to.
 *
 *   r0   in   data pointer (16-byte aligned, see the :128 qualifiers)
 *   r1   in   pointer to a table of 16-bit coefficients (8-byte aligned);
 *             def_fft passes the address of ff_cos_<n>_fixed
 *   r2   in   count; def_fft passes n4/2
 *
 * Register roles inside the function:
 *   r0    advances by 16 bytes per iteration
 *   r1    reads the table forwards, 8 bytes per iteration (d28)
 *   r3    reads the same table backwards, 8 bytes per iteration (d29),
 *         starting at r1 + 4*r2 - 8
 *   r2    decremented by 2 per iteration; the loop runs while it stays > 0
 *   r12   8 * (initial r2): byte distance between the four 16-byte
 *         vectors loaded and stored in each iteration
 *   r4    working copy of r0 for the strided loads and stores
 *   lr    temporary for the coefs address, then the constant -8
 *   q15   1, -1, -1, 1 in both halves; multiplied into the splatted
 *         coefficients to give them their signs
 *
 * The first iteration uses transform01 for the q0/q2 pair (only d5 is
 * multiplied) and then joins the common code at label 2; presumably the
 * coefficient for d4 is trivial at index 0 -- confirm against the C
 * reference.  Later iterations enter at label 1 and use transform2.
 *
 * Clobbers r0-r3, r12, q0-q3, q8-q10, q12-q15.  d8-d15 are not touched.
 */
function fft_pass_neon
        push            {r4,lr}
        movrel          lr,  coefs + 24         @ fourth row of coefs: 1, -1, -1, 1
        vld1.16         {d30},    [lr,:64]
        lsl             r12, r2,  #3
        vmov            d31, d30
        add             r3,  r1,  r2,  lsl #2
        mov             lr,  #-8
        sub             r3,  r3,  #2            @ last 16-bit entry below r1 + 4*r2
        mov             r4,  r0
        vld1.16         {d27[]},  [r3,:16]      @ that entry in every lane of d27
        sub             r3,  r3,  #6            @ now r1 + 4*r2 - 8
        vld1.16         {q0},     [r4,:128], r12
        vld1.16         {q1},     [r4,:128], r12
        vld1.16         {q2},     [r4,:128], r12
        vld1.16         {q3},     [r4,:128], r12
        vld1.16         {d28},    [r1,:64]!
        vld1.16         {d29},    [r3,:64], lr
        vswp            d1,  d2
        vswp            d5,  d6
        vtrn.32         d0,  d1
        vtrn.32         d4,  d5
        vdup.16         d25, d28[1]
        vmul.s16        d27, d27, d31           @ apply the sign pattern
        transform01     q0,  q2,  d5,  d25, d27, d20, q8, q9
        b               2f
1:
        mov             r4,  r0
        vdup.16         d26, d29[0]             @ uses d29 from the previous iteration, before the reload
        vld1.16         {q0},     [r4,:128], r12
        vld1.16         {q1},     [r4,:128], r12
        vld1.16         {q2},     [r4,:128], r12
        vld1.16         {q3},     [r4,:128], r12
        vld1.16         {d28},    [r1,:64]!
        vld1.16         {d29},    [r3,:64], lr
        vswp            d1,  d2
        vswp            d5,  d6
        vtrn.32         d0,  d1
        vtrn.32         d4,  d5
        vdup.16         d24, d28[0]
        vdup.16         d25, d28[1]
        vdup.16         d27, d29[3]
        vmul.s16        q13, q13, q15           @ apply the sign pattern to d26 and d27
        transform2      d0,  d4,  d1,  d5,  q0,  q2,  d24, d26, d25, d27, \
                        d16, d17, q9,  q10
2:
        vtrn.32         d2,  d3
        vtrn.32         d6,  d7
        vdup.16         d24, d28[2]
        vdup.16         d26, d29[2]
        vdup.16         d25, d28[3]
        vdup.16         d27, d29[1]
        vmul.s16        q13, q13, q15           @ apply the sign pattern to d26 and d27
        transform2      d2,  d6,  d3,  d7,  q1,  q3,  d24, d26, d25, d27, \
                        d16, d17, q9,  q10
        vtrn.32         d0,  d1
        vtrn.32         d2,  d3
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7
        vswp            d1,  d2
        vswp            d5,  d6
        mov             r4,  r0
        vst1.16         {q0},     [r4,:128], r12
        vst1.16         {q1},     [r4,:128], r12
        vst1.16         {q2},     [r4,:128], r12
        vst1.16         {q3},     [r4,:128], r12
        add             r0,  r0,  #16
        subs            r2,  r2,  #2
        bgt             1b
        pop             {r4,pc}
endfunc
194
/* Coefficients as signed 16-bit fixed point: round(x * 32768). */
#define F_SQRT1_2   23170       /* sqrt(1/2)   */
#define F_COS_16_1  30274       /* cos(pi/8)   */
#define F_COS_16_3  12540       /* cos(3*pi/8) */

/*
 * coefs: four rows of four 16-bit values, each with the sign pattern
 * +, -, -, +.  Rows start at byte offsets 0, 8, 16 and 24; fft_pass_neon
 * addresses the last row as coefs + 24.  fft16_neon loads the whole table
 * with a :128 qualifier, so it must be 16-byte aligned; align=4 is
 * presumably a power-of-two exponent interpreted by the const macro from
 * asm.S -- confirm there.
 */
const   coefs, align=4
        .short           F_SQRT1_2, -F_SQRT1_2, -F_SQRT1_2,  F_SQRT1_2
        .short           F_COS_16_1,-F_COS_16_1,-F_COS_16_1, F_COS_16_1
        .short           F_COS_16_3,-F_COS_16_3,-F_COS_16_3, F_COS_16_3
        .short           1,         -1,         -1,          1
endconst
205
/*
 * def_fft n, n2, n4: define the function fft<n>_neon.
 *
 *   n    transform size, used in the function and table symbol names
 *   n2   n/2, names the function called on the first half
 *   n4   n/4, names the function called on the third and fourth quarters
 *
 * Generated function:
 *   r0   in   data pointer, 4 bytes per element as implied by the offsets
 *
 * It calls fft<n2>_neon at r0, fft<n4>_neon at r0 + n4*2*4 bytes and again
 * at r0 + n4*3*4 bytes, then tail-branches to fft_pass_neon with
 * r0 = data, r1 = address of ff_cos_<n>_fixed and r2 = n4/2.
 * r4 and lr are restored before the branch, so fft_pass_neon returns
 * straight to the caller of fft<n>_neon.
 */
.macro  def_fft n, n2, n4
function fft\n\()_neon
        push            {r4, lr}
        mov             r4,  r0                 @ keep the base across the calls
        bl              fft\n2\()_neon
        add             r0,  r4,  #\n4*2*4
        bl              fft\n4\()_neon
        add             r0,  r4,  #\n4*3*4
        bl              fft\n4\()_neon
        mov             r0,  r4
        pop             {r4, lr}
        movrelx         r1,  X(ff_cos_\n\()_fixed)
        mov             r2,  #\n4/2
        b               fft_pass_neon
endfunc
.endm
222
/*
 * Instantiate fft32_neon .. fft65536_neon.  Each line passes n, n/2, n/4;
 * the two smaller sizes must name functions that exist (fft8_neon and
 * fft16_neon are hand-written, the rest come from earlier lines here).
 */
        def_fft    32,    16,     8
        def_fft    64,    32,    16
        def_fft   128,    64,    32
        def_fft   256,   128,    64
        def_fft   512,   256,   128
        def_fft  1024,   512,   256
        def_fft  2048,  1024,   512
        def_fft  4096,  2048,  1024
        def_fft  8192,  4096,  2048
        def_fft 16384,  8192,  4096
        def_fft 32768, 16384,  8192
        def_fft 65536, 32768, 16384
235
/*
 * ff_fft_fixed_calc_neon: exported entry point; dispatches on size.
 *
 *   r0   in   pointer to a structure whose first 32-bit word selects the
 *             transform (presumably log2 of the size -- confirm against
 *             the context struct in the C code)
 *   r1   in   data pointer, forwarded to the selected function in r0
 *
 * The word at [r0] minus 2 indexes fft_fixed_tab_neon, so 2 selects
 * fft4_neon and 16 selects fft65536_neon.  No range check is made; the
 * caller must keep the value within 2..16.  The selected function is
 * entered with bx, so it returns directly to the caller of this one.
 * Clobbers r2 and r3 before the branch.
 */
function ff_fft_fixed_calc_neon, export=1
        ldr             r2,  [r0]
        sub             r2,  r2,  #2            @ table starts at the 4-point entry
        movrel          r3,  fft_fixed_tab_neon
        ldr             r3,  [r3, r2, lsl #2]   @ 4-byte function pointers
        mov             r0,  r1
        bx              r3
endfunc
244
/*
 * fft_fixed_tab_neon: function pointers indexed by ff_fft_fixed_calc_neon.
 * Entry i is the transform of size 4 << i, so the 15 entries cover sizes
 * 4 to 65536.  relocate=1 is an option of the const macro from asm.S;
 * presumably it selects a section suited to data that needs load-time
 * relocation -- confirm there.
 */
const   fft_fixed_tab_neon, relocate=1
        .word           fft4_neon
        .word           fft8_neon
        .word           fft16_neon
        .word           fft32_neon
        .word           fft64_neon
        .word           fft128_neon
        .word           fft256_neon
        .word           fft512_neon
        .word           fft1024_neon
        .word           fft2048_neon
        .word           fft4096_neon
        .word           fft8192_neon
        .word           fft16384_neon
        .word           fft32768_neon
        .word           fft65536_neon
endconst