ARM: apply extern symbol prefix where needed
libav.git: libavcodec/arm/mdct_neon.S
/*
 * ARM NEON optimised MDCT
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        .fpu neon
        .text

#define ff_fft_calc_neon X(ff_fft_calc_neon)

function ff_imdct_half_neon, export=1
        push            {r4-r8,lr}

        mov             r12, #1
        ldr             lr,  [r0, #28]          @ mdct_bits
        ldr             r4,  [r0, #32]          @ tcos
        ldr             r3,  [r0, #8]           @ revtab
        lsl             r12, r12, lr            @ n = 1 << nbits
        lsr             lr,  r12, #2            @ n4 = n >> 2
        add             r7,  r2,  r12, lsl #1
        mov             r12, #-16
        sub             r7,  r7,  #16

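@ Pre-rotation, two complex lanes per iteration; roughly, with n2 = n/2
@ (cf. the scalar ff_imdct_half_c, deduced from the register comments):
@     j       = revtab[k]
@     z[j].re = in[n2-1-2k]*cos[k] - in[2k]*sin[k]
@     z[j].im = in[n2-1-2k]*sin[k] + in[2k]*cos[k]
@ d17 holds the reversed tail samples, d0 the head samples, d2/d3 the
@ cos/sin pairs; d4/d5 end up as the rotated re/im scattered via revtab.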
        vld2.32         {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
        vld2.32         {d0-d1},  [r2,:128]!    @ d0 =m0,x d1 =m1,x
        vrev64.32       d17, d17
        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s1
        vmul.f32        d6,  d17, d2
        vmul.f32        d7,  d0,  d2
1:
        subs            lr,  lr,  #2
        ldr             r6,  [r3], #4
        vmul.f32        d4,  d0,  d3
        vmul.f32        d5,  d17, d3
        vsub.f32        d4,  d6,  d4
        vadd.f32        d5,  d5,  d7
        uxth            r8,  r6,  ror #16
        uxth            r6,  r6
        add             r8,  r1,  r8,  lsl #3
        add             r6,  r1,  r6,  lsl #3
        beq             1f
        vld2.32         {d16-d17},[r7,:128],r12
        vld2.32         {d0-d1},  [r2,:128]!
        vrev64.32       d17, d17
        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s1
        vmul.f32        d6,  d17, d2
        vmul.f32        d7,  d0,  d2
        vst2.32         {d4[0],d5[0]}, [r6,:64]
        vst2.32         {d4[1],d5[1]}, [r8,:64]
        b               1b
1:
        vst2.32         {d4[0],d5[0]}, [r6,:64]
        vst2.32         {d4[1],d5[1]}, [r8,:64]

        mov             r4,  r0
        mov             r6,  r1
        bl              ff_fft_calc_neon

        mov             r12, #1
        ldr             lr,  [r4, #28]          @ mdct_bits
        ldr             r4,  [r4, #32]          @ tcos
        lsl             r12, r12, lr            @ n = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

        add             r4,  r4,  lr,  lsl #3
        add             r6,  r6,  lr,  lsl #3
        sub             r1,  r4,  #16
        sub             r3,  r6,  #16

        mov             r7,  #-16
        mov             r8,  r6
        mov             r0,  r3

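@ Post-rotation and reordering: the FFT output is scanned from the middle
@ of the buffer in both directions (r3/r0 downwards from z[n8-1], r6/r8
@ upwards from z[n8], r1/r4 likewise over the interleaved cos/sin table),
@ multiplied by the twiddles, partially reversed with vrev64 and written
@ back over both halves in place, roughly as in the post-rotation loop of
@ the scalar ff_imdct_half_c.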
        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
        vld2.32         {d20-d21},[r6,:128]!    @ d20=i2,r2 d21=i3,r3
        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
1:
        subs            lr,  lr,  #2
        vmul.f32        d7,  d0,  d18
        vld2.32         {d17,d19},[r4,:128]!    @ d17=c2,c3 d19=s2,s3
        vmul.f32        d4,  d1,  d18
        vmul.f32        d5,  d21, d19
        vmul.f32        d6,  d20, d19
        vmul.f32        d22, d1,  d16
        vmul.f32        d23, d21, d17
        vmul.f32        d24, d0,  d16
        vmul.f32        d25, d20, d17
        vadd.f32        d7,  d7,  d22
        vadd.f32        d6,  d6,  d23
        vsub.f32        d4,  d4,  d24
        vsub.f32        d5,  d5,  d25
        beq             1f
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128], r7
        vst2.32         {d5,d7},  [r8,:128]!
        b               1b
1:
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128]
        vst2.32         {d5,d7},  [r8,:128]

        pop             {r4-r8,pc}
        .endfunc

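/*
 * ff_imdct_calc_neon(FFTContext *s, FFTSample *out, const FFTSample *in)
 *
 * Full inverse MDCT built on ff_imdct_half_neon: the half transform is
 * written into the middle of the output buffer (out + n/4 floats), then
 * the outer quarters are filled by mirroring, roughly (cf. the scalar
 * ff_imdct_calc_c):
 *     out[k]     = -out[n/2-1-k]
 *     out[n-1-k] =  out[n/2+k]
 * The veor with 1<<31 flips the float sign bit instead of using vneg.
 */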
function ff_imdct_calc_neon, export=1
        push            {r4-r6,lr}

        ldr             r3,  [r0, #28]
        mov             r4,  #1
        mov             r5,  r1
        lsl             r4,  r4,  r3
        add             r1,  r1,  r4

        bl              ff_imdct_half_neon

        add             r0,  r5,  r4,  lsl #2
        add             r1,  r5,  r4,  lsl #1
        sub             r0,  r0,  #8
        sub             r2,  r1,  #16
        mov             r3,  #-16
        mov             r6,  #-8
        vmov.i32        d30, #1<<31
1:
        vld1.32         {d0-d1},  [r2,:128], r3
        pld             [r0, #-16]
        vrev64.32       q0,  q0
        vld1.32         {d2-d3},  [r1,:128]!
        veor            d4,  d1,  d30
        pld             [r2, #-16]
        vrev64.32       q1,  q1
        veor            d5,  d0,  d30
        vst1.32         {d2},     [r0,:64], r6
        vst1.32         {d3},     [r0,:64], r6
        vst1.32         {d4-d5},  [r5,:128]!
        subs            r4,  r4,  #16
        bgt             1b

        pop             {r4-r6,pc}
        .endfunc

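/*
 * ff_mdct_calc_neon(FFTContext *s, FFTSample *out, const FFTSample *in)
 *
 * Forward MDCT on top of the same n/4-point complex FFT: the n input
 * samples are folded into n/4 complex values during the pre-rotation
 * (the in0u/in2u/in3u/in4u and in1d..in4d names in the comments refer to
 * quarters of the input read upwards or downwards), permuted through
 * revtab, transformed with ff_fft_calc_neon, then post-rotated; roughly
 * the same structure as the scalar ff_mdct_calc_c in libavcodec/mdct.c.
 */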
function ff_mdct_calc_neon, export=1
        push            {r4-r10,lr}

        mov             r12, #1
        ldr             lr,  [r0, #28]          @ mdct_bits
        ldr             r4,  [r0, #32]          @ tcos
        ldr             r3,  [r0, #8]           @ revtab
        lsl             lr,  r12, lr            @ n = 1 << nbits
        add             r7,  r2,  lr            @ in4u
        sub             r9,  r7,  #16           @ in4d
        add             r2,  r7,  lr,  lsl #1   @ in3u
        add             r8,  r9,  lr,  lsl #1   @ in3d
        add             r5,  r4,  lr,  lsl #1
        sub             r5,  r5,  #16
        sub             r3,  r3,  #4
        mov             r12, #-16

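@ Fold the four quarters of the input into (R, I) pairs before the
@ rotation; the u/d suffixes below mean a quarter is read upwards or
@ downwards (vrev64 reverses the downward reads).  As labelled:
@ I = in4d-in4u and -R = in3u+in3d for the first lane pair,
@ R = in0u-in2d and -I = in2u+in1d for the second.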
        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1 in2d1,in2d0
        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1 in1d1,in1d0
        vsub.f32        d0,  d18, d0            @ in4d-in4u I
        vld2.32         {d20,d21},[r4,:128]!    @ c0,c1 s0,s1
        vrev64.32       q1,  q1                 @ in2d0,in2d1 in1d0,in1d1
        vld2.32         {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
        vadd.f32        d1,  d1,  d19           @ in3u+in3d -R
        vsub.f32        d16, d16, d2            @ in0u-in2d R
        vadd.f32        d17, d17, d3            @ in2u+in1d -I
1:
        vmul.f32        d7,  d0,  d21           @ I*s
        ldr             r10, [r3, lr, lsr #1]
        vmul.f32        d6,  d1,  d20           @ -R*c
        ldr             r6,  [r3, #4]!
        vmul.f32        d4,  d1,  d21           @ -R*s
        vmul.f32        d5,  d0,  d20           @ I*c
        vmul.f32        d24, d16, d30           @ R*c
        vmul.f32        d25, d17, d31           @ -I*s
        vmul.f32        d22, d16, d31           @ R*s
        vmul.f32        d23, d17, d30           @ -I*c
        subs            lr,  lr,  #16
        vsub.f32        d6,  d6,  d7            @ -R*c-I*s
        vadd.f32        d7,  d4,  d5            @ -R*s+I*c
        vsub.f32        d24, d25, d24           @ -I*s-R*c
        vadd.f32        d25, d22, d23           @ R*s-I*c
        beq             1f
        mov             r12, #-16
        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
        vneg.f32        d7,  d7                 @ R*s-I*c
        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1 in2d1,in2d0
        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1 in1d1,in1d0
        vsub.f32        d0,  d18, d0            @ in4d-in4u I
        vld2.32         {d20,d21},[r4,:128]!    @ c0,c1 s0,s1
        vrev64.32       q1,  q1                 @ in2d0,in2d1 in1d0,in1d1
        vld2.32         {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
        vadd.f32        d1,  d1,  d19           @ in3u+in3d -R
        vsub.f32        d16, d16, d2            @ in0u-in2d R
        vadd.f32        d17, d17, d3            @ in2u+in1d -I
        uxth            r12, r6,  ror #16
        uxth            r6,  r6
        add             r12, r1,  r12, lsl #3
        add             r6,  r1,  r6,  lsl #3
        vst2.32         {d6[0],d7[0]},   [r6,:64]
        vst2.32         {d6[1],d7[1]},   [r12,:64]
        uxth            r6,  r10, ror #16
        uxth            r10, r10
        add             r6,  r1,  r6,  lsl #3
        add             r10, r1,  r10, lsl #3
        vst2.32         {d24[0],d25[0]}, [r10,:64]
        vst2.32         {d24[1],d25[1]}, [r6,:64]
        b               1b
1:
        vneg.f32        d7,  d7                 @ R*s-I*c
        uxth            r12, r6,  ror #16
        uxth            r6,  r6
        add             r12, r1,  r12, lsl #3
        add             r6,  r1,  r6,  lsl #3
        vst2.32         {d6[0],d7[0]},   [r6,:64]
        vst2.32         {d6[1],d7[1]},   [r12,:64]
        uxth            r6,  r10, ror #16
        uxth            r10, r10
        add             r6,  r1,  r6,  lsl #3
        add             r10, r1,  r10, lsl #3
        vst2.32         {d24[0],d25[0]}, [r10,:64]
        vst2.32         {d24[1],d25[1]}, [r6,:64]

        mov             r4,  r0
        mov             r6,  r1
        bl              ff_fft_calc_neon

        mov             r12, #1
        ldr             lr,  [r4, #28]          @ mdct_bits
        ldr             r4,  [r4, #32]          @ tcos
        lsl             r12, r12, lr            @ n = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

        add             r4,  r4,  lr,  lsl #3
        add             r6,  r6,  lr,  lsl #3
        sub             r1,  r4,  #16
        sub             r3,  r6,  #16

        mov             r7,  #-16
        mov             r8,  r6
        mov             r0,  r3

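@ Post-rotation, structured like the one in ff_imdct_half_neon above but
@ with the extra negation (vneg.f32 q2) the forward transform needs; the
@ rotated values are written back over both halves of the output buffer
@ from the middle outwards.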
        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
        vld2.32         {d20-d21},[r6,:128]!    @ d20=r2,i2 d21=r3,i3
        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
1:
        subs            lr,  lr,  #2
        vmul.f32        d7,  d0,  d18           @ r1*s1,r0*s0
        vld2.32         {d17,d19},[r4,:128]!    @ c2,c3 s2,s3
        vmul.f32        d4,  d1,  d18           @ i1*s1,i0*s0
        vmul.f32        d5,  d21, d19           @ i2*s2,i3*s3
        vmul.f32        d6,  d20, d19           @ r2*s2,r3*s3
        vmul.f32        d24, d0,  d16           @ r1*c1,r0*c0
        vmul.f32        d25, d20, d17           @ r2*c2,r3*c3
        vmul.f32        d22, d21, d17           @ i2*c2,i3*c3
        vmul.f32        d23, d1,  d16           @ i1*c1,i0*c0
        vadd.f32        d4,  d4,  d24           @ i1*s1+r1*c1,i0*s0+r0*c0
        vadd.f32        d5,  d5,  d25           @ i2*s2+r2*c2,i3*s3+r3*c3
        vsub.f32        d6,  d22, d6            @ i2*c2-r2*s2,i3*c3-r3*s3
        vsub.f32        d7,  d23, d7            @ i1*c1-r1*s1,i0*c0-r0*s0
        vneg.f32        q2,  q2
        beq             1f
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128], r7
        vst2.32         {d5,d7},  [r8,:128]!
        b               1b
1:
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128]
        vst2.32         {d5,d7},  [r8,:128]

        pop             {r4-r10,pc}
        .endfunc