ARM: NEON optimised MDCT
libavcodec/arm/mdct_neon.S
/*
 * ARM NEON optimised MDCT
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        .fpu neon
        .text

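@ ff_imdct_half_neon: half-length inverse MDCT (n/2 output samples)
@ r0 = MDCT context, r1 = output buffer (n/4 complex values), r2 = input
@ Three stages:
@   1. pre-rotation:  samples are read from both ends of the input,
@      rotated by the tcos/tsin twiddle factors and scatter-stored
@      through the revtab permutation into the output buffer
@   2. in-place complex FFT of the n/4 pre-rotated values
@   3. post-rotation: the FFT result is rotated by the twiddles again,
@      working outwards from the middle, and written back in place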
function ff_imdct_half_neon, export=1
        push            {r4-r8,lr}

        mov             r12, #1
        ldr             lr,  [r0, #4]           @ nbits
        ldr             r4,  [r0, #8]           @ tcos
        ldr             r5,  [r0, #12]          @ tsin
        ldr             r3,  [r0, #24]          @ revtab
        lsl             r12, r12, lr            @ n = 1 << nbits
        lsr             lr,  r12, #2            @ n4 = n >> 2
        add             r7,  r2,  r12, lsl #1
        mov             r12, #-16
        sub             r7,  r7,  #16

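@ Pre-rotation: each iteration pairs two samples read forwards from the
@ start of the input (r2) with two read backwards from the end (r7),
@ multiplies them by the tcos/tsin twiddles and scatter-stores the
@ resulting complex values through the revtab entries loaded into r6/r8.
@ The loads for the next iteration are issued before the stores of the
@ current one to hide memory latency.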
        vld2.32         {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
        vld2.32         {d0-d1},  [r2,:128]!    @ d0 =m0,x d1 =m1,x
        vrev64.32       d17, d17
        vld1.32         {d2}, [r4,:64]!         @ d2=c0,c1
        vmul.f32        d6,  d17, d2
        vld1.32         {d3}, [r5,:64]!         @ d3=s0,s1
        vmul.f32        d7,  d0,  d2
1:
        subs            lr,  lr,  #2
        ldr             r6,  [r3], #4
        vmul.f32        d4,  d0,  d3
        vmul.f32        d5,  d17, d3
        vsub.f32        d4,  d6,  d4
        vadd.f32        d5,  d5,  d7
        uxtah           r8,  r1,  r6,  ror #16
        uxtah           r6,  r1,  r6
        beq             1f
        vld2.32         {d16-d17},[r7,:128],r12
        vld2.32         {d0-d1},  [r2,:128]!
        vrev64.32       d17, d17
        vld1.32         {d2}, [r4,:64]!
        vmul.f32        d6,  d17, d2
        vld1.32         {d3}, [r5,:64]!
        vmul.f32        d7,  d0,  d2
        vst2.32         {d4[0],d5[0]}, [r6,:64]
        vst2.32         {d4[1],d5[1]}, [r8,:64]
        b               1b
1:
        vst2.32         {d4[0],d5[0]}, [r6,:64]
        vst2.32         {d4[1],d5[1]}, [r8,:64]

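@ In-place complex FFT of the n/4 pre-rotated values.  r0 + 16 appears
@ to be the FFT context embedded in the MDCT context (the revtab load
@ above reads from offset 24 for the same reason); r4/r6 preserve the
@ MDCT context and output pointer across the call.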
        mov             r4,  r0
        mov             r6,  r1
        add             r0,  r0,  #16
        bl              ff_fft_calc_neon

        mov             r12, #1
        ldr             lr,  [r4, #4]           @ nbits
        ldr             r5,  [r4, #12]          @ tsin
        ldr             r4,  [r4, #8]           @ tcos
        lsl             r12, r12, lr            @ n = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

        add             r4,  r4,  lr,  lsl #2
        add             r5,  r5,  lr,  lsl #2
        add             r6,  r6,  lr,  lsl #3
        sub             r1,  r4,  #8
        sub             r2,  r5,  #8
        sub             r3,  r6,  #16

        mov             r7,  #-16
        mov             r12, #-8
        mov             r8,  r6
        mov             r0,  r3

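@ Post-rotation: process the FFT result outwards from the middle of the
@ buffer.  r3/r1/r2 walk downwards over the lower half and its twiddles,
@ r6/r4/r5 walk upwards over the upper half; each iteration rotates two
@ complex values from each half and stores them back in place through
@ r0 (downwards) and r8 (upwards).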
        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
        vld2.32         {d20-d21},[r6,:128]!    @ d20=i2,r2 d21=i3,r3
        vld1.32         {d18}, [r2,:64], r12    @ d18=s1,s0
1:
        subs            lr,  lr,  #2
        vmul.f32        d7,  d0,  d18
        vld1.32         {d19}, [r5,:64]!        @ d19=s2,s3
        vmul.f32        d4,  d1,  d18
        vld1.32         {d16}, [r1,:64], r12    @ d16=c1,c0
        vmul.f32        d5,  d21, d19
        vld1.32         {d17}, [r4,:64]!        @ d17=c2,c3
        vmul.f32        d6,  d20, d19
        vmul.f32        d22, d1,  d16
        vmul.f32        d23, d21, d17
        vmul.f32        d24, d0,  d16
        vmul.f32        d25, d20, d17
        vadd.f32        d7,  d7,  d22
        vadd.f32        d6,  d6,  d23
        vsub.f32        d4,  d4,  d24
        vsub.f32        d5,  d5,  d25
        beq             1f
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld1.32         {d18}, [r2,:64], r12
        vrev64.32       q3,  q3
        vst2.32         {d4,d6}, [r0,:128], r7
        vst2.32         {d5,d7}, [r8,:128]!
        b               1b
1:
        vrev64.32       q3,  q3
        vst2.32         {d4,d6}, [r0,:128]
        vst2.32         {d5,d7}, [r8,:128]

        pop             {r4-r8,pc}
        .endfunc

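@ ff_imdct_calc_neon: full-length inverse MDCT.
@ ff_imdct_half_neon fills the middle half of the output buffer; the
@ outer quarters are then reconstructed from it by symmetry:
@     output[i]         = -output[n/2 - 1 - i]     for 0 <= i < n/4
@     output[n - 1 - i] =  output[n/2 + i]
@ d30 holds the float sign bit so that veor can negate the mirrored
@ values.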
function ff_imdct_calc_neon, export=1
        push            {r4-r6,lr}

        ldr             r3,  [r0, #4]
        mov             r4,  #1
        mov             r5,  r1
        lsl             r4,  r4,  r3
        add             r1,  r1,  r4

        bl              ff_imdct_half_neon

        add             r0,  r5,  r4,  lsl #2
        add             r1,  r5,  r4,  lsl #1
        sub             r0,  r0,  #8
        sub             r2,  r1,  #16
        mov             r3,  #-16
        mov             r6,  #-8
        vmov.i32        d30, #1<<31
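@ r2 walks backwards over the second quarter and r1 forwards over the
@ third; negated mirrors go to r5 (start of the buffer, forwards),
@ plain mirrors to r0 (end of the buffer, backwards).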
1:
        vld1.32         {d0-d1}, [r2,:128], r3
        pld             [r0, #-16]
        vrev64.32       q0,  q0
        vld1.32         {d2-d3}, [r1,:128]!
        veor            d4,  d1,  d30
        pld             [r2, #-16]
        vrev64.32       q1,  q1
        veor            d5,  d0,  d30
        vst1.32         {d2}, [r0,:64], r6
        vst1.32         {d3}, [r0,:64], r6
        vst1.32         {d4-d5}, [r5,:128]!
        subs            r4,  r4,  #16
        bgt             1b

        pop             {r4-r6,pc}
        .endfunc

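@ ff_mdct_calc_neon: forward MDCT
@ r0 = MDCT context, r1 = output buffer, r2 = input (n samples)
@ The pre-rotation below runs in two passes, each producing n/8 complex
@ values: input samples are folded together in mirrored pairs, rotated
@ by the tcos/tsin twiddles and scatter-stored through revtab.  A
@ complex FFT of the n/4 values and a final post-rotation then yield
@ the n/2 output coefficients, written back in place.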
function ff_mdct_calc_neon, export=1
        push            {r4-r10,lr}

        mov             r12, #1
        ldr             lr,  [r0, #4]           @ nbits
        ldr             r4,  [r0, #8]           @ tcos
        ldr             r5,  [r0, #12]          @ tsin
        ldr             r3,  [r0, #24]          @ revtab
        lsl             lr,  r12, lr            @ n = 1 << nbits
        add             r7,  r2,  lr            @ in4u
        sub             r9,  r7,  #16           @ in4d
        add             r2,  r7,  lr,  lsl #1   @ in3u
        add             r8,  r9,  lr,  lsl #1   @ in3d
        mov             r12, #-16

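@ First pre-rotation pass: sample pairs mirrored about the n/4 point
@ give I = in4d-in4u, pairs mirrored about the 3n/4 point give
@ -R = in3u+in3d; the rotated results are scatter-stored through the
@ revtab entries in r6/r10.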
        vld2.32         {d16,d18},[r9,:128],r12 @ x,x in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ x,x in3d1,in3d0
        vld2.32         {d20,d21},[r7,:128]!    @ in4u0,in4u1 x,x
        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
        vld2.32         {d0, d1}, [r2,:128]!    @ in3u0,in3u1 x,x
        vsub.f32        d20, d18, d20           @ in4d-in4u I
        vld1.32         {d2}, [r4,:64]!         @ c0,c1
        vadd.f32        d0,  d0,  d19           @ in3u+in3d -R
        vld1.32         {d3}, [r5,:64]!         @ s0,s1
1:
        vmul.f32        d7,  d20, d3            @ I*s
        vmul.f32        d6,  d0,  d2            @ -R*c
        ldr             r6,  [r3], #4
        vmul.f32        d4,  d0,  d3            @ -R*s
        vmul.f32        d5,  d20, d2            @ I*c
        subs            lr,  lr,  #16
        vsub.f32        d6,  d6,  d7            @ -R*c-I*s
        vadd.f32        d7,  d4,  d5            @ -R*s+I*c
        uxtah           r10, r1,  r6,  ror #16
        uxtah           r6,  r1,  r6
        beq             1f
        vld2.32         {d16,d18},[r9,:128],r12 @ x,x in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ x,x in3d1,in3d0
        vneg.f32        d7,  d7                 @ R*s-I*c
        vld2.32         {d20,d21},[r7,:128]!    @ in4u0,in4u1 x,x
        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
        vld2.32         {d0, d1}, [r2,:128]!    @ in3u0,in3u1 x,x
        vsub.f32        d20, d18, d20           @ in4d-in4u I
        vld1.32         {d2}, [r4,:64]!         @ c0,c1
        vadd.f32        d0,  d0,  d19           @ in3u+in3d -R
        vld1.32         {d3}, [r5,:64]!         @ s0,s1
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r10,:64]
        b               1b
1:
        vneg.f32        d7,  d7                 @ R*s-I*c
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r10,:64]

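@ Second pre-rotation pass: the remaining sample pairs, R = in0u-in2d
@ and -I = in2u+in1d, are handled the same way.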
        mov             r12, #1
        ldr             lr,  [r0, #4]           @ nbits
        lsl             lr,  r12, lr            @ n = 1 << nbits
        sub             r8,  r2,  #16           @ in1d
        add             r2,  r9,  #16           @ in0u
        sub             r9,  r7,  #16           @ in2d
        mov             r12, #-16

        vld2.32         {d16,d18},[r9,:128],r12 @ x,x in2d1,in2d0
        vld2.32         {d17,d19},[r8,:128],r12 @ x,x in1d1,in1d0
        vld2.32         {d20,d21},[r7,:128]!    @ in2u0,in2u1 x,x
        vrev64.32       q9,  q9                 @ in2d0,in2d1 in1d0,in1d1
        vld2.32         {d0, d1}, [r2,:128]!    @ in0u0,in0u1 x,x
        vsub.f32        d0,  d0,  d18           @ in0u-in2d R
        vld1.32         {d2}, [r4,:64]!         @ c0,c1
        vadd.f32        d20, d20, d19           @ in2u+in1d -I
        vld1.32         {d3}, [r5,:64]!         @ s0,s1
1:
        vmul.f32        d6,  d0,  d2            @ R*c
        vmul.f32        d7,  d20, d3            @ -I*s
        ldr             r6,  [r3], #4
        vmul.f32        d4,  d0,  d3            @ R*s
        vmul.f32        d5,  d20, d2            @ I*c
        subs            lr,  lr,  #16
        vsub.f32        d6,  d7,  d6            @ I*s-R*c
        vadd.f32        d7,  d4,  d5            @ R*s-I*c
        uxtah           r10, r1,  r6,  ror #16
        uxtah           r6,  r1,  r6
        beq             1f
        vld2.32         {d16,d18},[r9,:128],r12 @ x,x in2d1,in2d0
        vld2.32         {d17,d19},[r8,:128],r12 @ x,x in1d1,in1d0
        vld2.32         {d20,d21},[r7,:128]!    @ in2u0,in2u1 x,x
        vrev64.32       q9,  q9                 @ in2d0,in2d1 in1d0,in1d1
        vld2.32         {d0, d1}, [r2,:128]!    @ in0u0,in0u1 x,x
        vsub.f32        d0,  d0,  d18           @ in0u-in2d R
        vld1.32         {d2}, [r4,:64]!         @ c0,c1
        vadd.f32        d20, d20, d19           @ in2u+in1d -I
        vld1.32         {d3}, [r5,:64]!         @ s0,s1
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r10,:64]
        b               1b
1:
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r10,:64]

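@ In-place complex FFT of the n/4 pre-rotated values, as in
@ ff_imdct_half_neon above.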
        mov             r4,  r0
        mov             r6,  r1
        add             r0,  r0,  #16
        bl              ff_fft_calc_neon

        mov             r12, #1
        ldr             lr,  [r4, #4]           @ nbits
        ldr             r5,  [r4, #12]          @ tsin
        ldr             r4,  [r4, #8]           @ tcos
        lsl             r12, r12, lr            @ n = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

        add             r4,  r4,  lr,  lsl #2
        add             r5,  r5,  lr,  lsl #2
        add             r6,  r6,  lr,  lsl #3
        sub             r1,  r4,  #8
        sub             r2,  r5,  #8
        sub             r3,  r6,  #16

        mov             r7,  #-16
        mov             r12, #-8
        mov             r8,  r6
        mov             r0,  r3

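@ Post-rotation, working outwards from the middle as in
@ ff_imdct_half_neon; the vneg in the loop accounts for the sign
@ convention of the forward transform.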
        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
        vld2.32         {d20-d21},[r6,:128]!    @ d20=r2,i2 d21=r3,i3
        vld1.32         {d18}, [r2,:64], r12    @ d18=s1,s0
1:
        subs            lr,  lr,  #2
        vmul.f32        d7,  d0,  d18           @ r1*s1,r0*s0
        vld1.32         {d19}, [r5,:64]!        @ s2,s3
        vmul.f32        d4,  d1,  d18           @ i1*s1,i0*s0
        vld1.32         {d16}, [r1,:64], r12    @ c1,c0
        vmul.f32        d5,  d21, d19           @ i2*s2,i3*s3
        vld1.32         {d17}, [r4,:64]!        @ c2,c3
        vmul.f32        d6,  d20, d19           @ r2*s2,r3*s3
        vmul.f32        d24, d0,  d16           @ r1*c1,r0*c0
        vmul.f32        d25, d20, d17           @ r2*c2,r3*c3
        vmul.f32        d22, d21, d17           @ i2*c2,i3*c3
        vmul.f32        d23, d1,  d16           @ i1*c1,i0*c0
        vadd.f32        d4,  d4,  d24           @ i1*s1+r1*c1,i0*s0+r0*c0
        vadd.f32        d5,  d5,  d25           @ i2*s2+r2*c2,i3*s3+r3*c3
        vsub.f32        d6,  d22, d6            @ i2*c2-r2*s2,i3*c3-r3*s3
        vsub.f32        d7,  d23, d7            @ i1*c1-r1*s1,i0*c0-r0*s0
        vneg.f32        q2,  q2
        beq             1f
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld1.32         {d18}, [r2,:64], r12
        vrev64.32       q3,  q3
        vst2.32         {d4,d6}, [r0,:128], r7
        vst2.32         {d5,d7}, [r8,:128]!
        b               1b
1:
        vrev64.32       q3,  q3
        vst2.32         {d4,d6}, [r0,:128]
        vst2.32         {d5,d7}, [r8,:128]

        pop             {r4-r10,pc}
        .endfunc