/*
 * ARM NEON optimised MDCT
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        .fpu neon
        .text

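@ ff_imdct_half_neon computes the first half of the inverse MDCT using the
@ standard decomposition: pre-rotate n/4 complex values by the twiddle
@ factors tcos/tsin while applying the bit-reversal permutation revtab,
@ run an in-place complex FFT, then post-rotate the result.  A rough
@ scalar sketch of the pre-rotation loop below, following ff_imdct_half_c
@ in mdct.c, with CMUL(dre, dim, a, b, c, s) standing for
@ dre = a*c - b*s and dim = a*s + b*c:
@
@     in1 = input;  in2 = input + n2 - 1;
@     for (k = 0; k < n4; k++) {
@         j = revtab[k];
@         CMUL(z[j].re, z[j].im, *in2, *in1, tcos[k], tsin[k]);
@         in1 += 2;  in2 -= 2;
@     }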
function ff_imdct_half_neon, export=1
        push            {r4-r8,lr}

        mov             r12, #1
        ldr             lr,  [r0, #28]          @ mdct_bits
        ldr             r4,  [r0, #32]          @ tcos
        ldr             r5,  [r0, #36]          @ tsin
        ldr             r3,  [r0, #8]           @ revtab
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #2            @ n4 = n >> 2
        add             r7,  r2,  r12, lsl #1
        mov             r12, #-16
        sub             r7,  r7,  #16

        vld2.32         {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
        vld2.32         {d0-d1},  [r2,:128]!    @ d0 =m0,x d1 =m1,x
        vrev64.32       d17, d17
        vld1.32         {d2},     [r4,:64]!     @ d2=c0,c1
        vmul.f32        d6,  d17, d2
        vld1.32         {d3},     [r5,:64]!     @ d3=s0,s1
        vmul.f32        d7,  d0,  d2
1:
        subs            lr,  lr,  #2
        ldr             r6,  [r3], #4           @ next two 16-bit revtab entries
        vmul.f32        d4,  d0,  d3
        vmul.f32        d5,  d17, d3
        vsub.f32        d4,  d6,  d4
        vadd.f32        d5,  d5,  d7
        uxth            r8,  r6,  ror #16       @ revtab[k+1]
        uxth            r6,  r6                 @ revtab[k]
        add             r8,  r1,  r8,  lsl #3   @ &z[revtab[k+1]]
        add             r6,  r1,  r6,  lsl #3   @ &z[revtab[k]]
        beq             1f
        vld2.32         {d16-d17},[r7,:128],r12
        vld2.32         {d0-d1},  [r2,:128]!
        vrev64.32       d17, d17
        vld1.32         {d2},     [r4,:64]!
        vmul.f32        d6,  d17, d2
        vld1.32         {d3},     [r5,:64]!
        vmul.f32        d7,  d0,  d2
        vst2.32         {d4[0],d5[0]}, [r6,:64]
        vst2.32         {d4[1],d5[1]}, [r8,:64]
        b               1b
1:
        vst2.32         {d4[0],d5[0]}, [r6,:64]
        vst2.32         {d4[1],d5[1]}, [r8,:64]

        mov             r4,  r0
        mov             r6,  r1
        bl              ff_fft_calc_neon

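        @ Post-rotation and reordering: the FFT output z[0..n4) is
        @ rewritten in place, one element from each end of the active
        @ range per step.  Scalar sketch, again following ff_imdct_half_c
        @ (CMUL as defined above):
        @
        @     for (k = 0; k < n8; k++) {
        @         CMUL(r0, i1, z[n8-k-1].im, z[n8-k-1].re,
        @              tsin[n8-k-1], tcos[n8-k-1]);
        @         CMUL(r1, i0, z[n8+k].im, z[n8+k].re,
        @              tsin[n8+k], tcos[n8+k]);
        @         z[n8-k-1].re = r0;  z[n8-k-1].im = i0;
        @         z[n8+k].re   = r1;  z[n8+k].im   = i1;
        @     }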
        mov             r12, #1
        ldr             lr,  [r4, #28]          @ mdct_bits
        ldr             r5,  [r4, #36]          @ tsin
        ldr             r4,  [r4, #32]          @ tcos
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

        add             r4,  r4,  lr,  lsl #2
        add             r5,  r5,  lr,  lsl #2
        add             r6,  r6,  lr,  lsl #3
        sub             r1,  r4,  #8
        sub             r2,  r5,  #8
        sub             r3,  r6,  #16

        mov             r7,  #-16
        mov             r12, #-8
        mov             r8,  r6
        mov             r0,  r3

        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
        vld2.32         {d20-d21},[r6,:128]!    @ d20=i2,r2 d21=i3,r3
        vld1.32         {d18},    [r2,:64], r12 @ d18=s1,s0
1:
        subs            lr,  lr,  #2
        vmul.f32        d7,  d0,  d18
        vld1.32         {d19},    [r5,:64]!     @ d19=s2,s3
        vmul.f32        d4,  d1,  d18
        vld1.32         {d16},    [r1,:64], r12 @ d16=c1,c0
        vmul.f32        d5,  d21, d19
        vld1.32         {d17},    [r4,:64]!     @ d17=c2,c3
        vmul.f32        d6,  d20, d19
        vmul.f32        d22, d1,  d16
        vmul.f32        d23, d21, d17
        vmul.f32        d24, d0,  d16
        vmul.f32        d25, d20, d17
        vadd.f32        d7,  d7,  d22
        vadd.f32        d6,  d6,  d23
        vsub.f32        d4,  d4,  d24
        vsub.f32        d5,  d5,  d25
        beq             1f
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld1.32         {d18},    [r2,:64], r12
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128], r7
        vst2.32         {d5,d7},  [r8,:128]!
        b               1b
1:
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128]
        vst2.32         {d5,d7},  [r8,:128]

        pop             {r4-r8,pc}
.endfunc

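@ ff_imdct_calc_neon derives the full n-point inverse transform from the
@ half transform above: the half IMDCT is computed into the middle of the
@ output buffer, then extended using the symmetries of the IMDCT.  Scalar
@ sketch, following ff_imdct_calc_c in mdct.c:
@
@     ff_imdct_half(s, output + n4, input);
@     for (k = 0; k < n4; k++) {
@         output[k]         = -output[n2 - k - 1];  @ mirrored, negated
@         output[n - k - 1] =  output[n2 + k];      @ mirrored
@     }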
function ff_imdct_calc_neon, export=1
        push            {r4-r6,lr}

        ldr             r3,  [r0, #28]          @ mdct_bits
        mov             r4,  #1
        mov             r5,  r1
        lsl             r4,  r4,  r3            @ n = 1 << nbits
        add             r1,  r1,  r4            @ output + n4 floats (n bytes)

        bl              ff_imdct_half_neon

        add             r0,  r5,  r4,  lsl #2   @ end of output
        add             r1,  r5,  r4,  lsl #1   @ output + n2 floats
        sub             r0,  r0,  #8
        sub             r2,  r1,  #16
        mov             r3,  #-16
        mov             r6,  #-8
        vmov.i32        d30, #1<<31             @ sign bit: veor negates floats
1:
        vld1.32         {d0-d1},  [r2,:128], r3
        pld             [r0, #-16]
        vrev64.32       q0,  q0
        vld1.32         {d2-d3},  [r1,:128]!
        veor            d4,  d1,  d30
        pld             [r2, #-16]
        vrev64.32       q1,  q1
        veor            d5,  d0,  d30
        vst1.32         {d2},     [r0,:64], r6
        vst1.32         {d3},     [r0,:64], r6
        vst1.32         {d4-d5},  [r5,:128]!
        subs            r4,  r4,  #16
        bgt             1b

        pop             {r4-r6,pc}
.endfunc

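@ ff_mdct_calc_neon folds the n input samples down to n/4 pre-rotated
@ complex values and then reuses the same FFT/post-rotation scheme as the
@ inverse transform.  The folding runs in two passes of n/8 complex values
@ each: the first pass combines the input regions labelled in3/in4 in the
@ comments below, the second pass in0/in1/in2; together they implement
@ the pre-rotation loop of the scalar ff_mdct_calc_c in mdct.c.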
function ff_mdct_calc_neon, export=1
        push            {r4-r10,lr}

        mov             r12, #1
        ldr             lr,  [r0, #28]          @ mdct_bits
        ldr             r4,  [r0, #32]          @ tcos
        ldr             r5,  [r0, #36]          @ tsin
        ldr             r3,  [r0, #8]           @ revtab
        lsl             lr,  r12, lr            @ n = 1 << nbits
        add             r7,  r2,  lr            @ in4u
        sub             r9,  r7,  #16           @ in4d
        add             r2,  r7,  lr,  lsl #1   @ in3u
        add             r8,  r9,  lr,  lsl #1   @ in3d
        mov             r12, #-16

        vld2.32         {d16,d18},[r9,:128],r12 @ x,x in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ x,x in3d1,in3d0
        vld2.32         {d20,d21},[r7,:128]!    @ in4u0,in4u1 x,x
        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
        vld2.32         {d0, d1}, [r2,:128]!    @ in3u0,in3u1 x,x
        vsub.f32        d20, d18, d20           @ in4d-in4u    I
        vld1.32         {d2},     [r4,:64]!     @ c0,c1
        vadd.f32        d0,  d0,  d19           @ in3u+in3d   -R
        vld1.32         {d3},     [r5,:64]!     @ s0,s1
1:
        vmul.f32        d7,  d20, d3            @  I*s
        vmul.f32        d6,  d0,  d2            @ -R*c
        ldr             r6,  [r3], #4
        vmul.f32        d4,  d0,  d3            @ -R*s
        vmul.f32        d5,  d20, d2            @  I*c
        subs            lr,  lr,  #16
        vsub.f32        d6,  d6,  d7            @ -R*c-I*s
        vadd.f32        d7,  d4,  d5            @ -R*s+I*c
        uxth            r10, r6,  ror #16
        uxth            r6,  r6
        add             r10, r1,  r10, lsl #3
        add             r6,  r1,  r6,  lsl #3
        beq             1f
        vld2.32         {d16,d18},[r9,:128],r12 @ x,x in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ x,x in3d1,in3d0
        vneg.f32        d7,  d7                 @  R*s-I*c
        vld2.32         {d20,d21},[r7,:128]!    @ in4u0,in4u1 x,x
        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
        vld2.32         {d0, d1}, [r2,:128]!    @ in3u0,in3u1 x,x
        vsub.f32        d20, d18, d20           @ in4d-in4u    I
        vld1.32         {d2},     [r4,:64]!     @ c0,c1
        vadd.f32        d0,  d0,  d19           @ in3u+in3d   -R
        vld1.32         {d3},     [r5,:64]!     @ s0,s1
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r10,:64]
        b               1b
1:
        vneg.f32        d7,  d7                 @  R*s-I*c
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r10,:64]

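        @ Second folding pass: the remaining n/8 complex values, built
        @ from the in0/in1/in2 regions of the input as annotated below
        @ (R = in0u-in2d, -I = in2u+in1d).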
        mov             r12, #1
        ldr             lr,  [r0, #28]          @ mdct_bits
        lsl             lr,  r12, lr            @ n = 1 << nbits
        sub             r8,  r2,  #16           @ in1d
        add             r2,  r9,  #16           @ in0u
        sub             r9,  r7,  #16           @ in2d
        mov             r12, #-16

        vld2.32         {d16,d18},[r9,:128],r12 @ x,x in2d1,in2d0
        vld2.32         {d17,d19},[r8,:128],r12 @ x,x in1d1,in1d0
        vld2.32         {d20,d21},[r7,:128]!    @ in2u0,in2u1 x,x
        vrev64.32       q9,  q9                 @ in2d0,in2d1 in1d0,in1d1
        vld2.32         {d0, d1}, [r2,:128]!    @ in0u0,in0u1 x,x
        vsub.f32        d0,  d0,  d18           @ in0u-in2d    R
        vld1.32         {d2},     [r4,:64]!     @ c0,c1
        vadd.f32        d20, d20, d19           @ in2u+in1d   -I
        vld1.32         {d3},     [r5,:64]!     @ s0,s1
1:
        vmul.f32        d6,  d0,  d2            @  R*c
        vmul.f32        d7,  d20, d3            @ -I*s
        ldr             r6,  [r3], #4
        vmul.f32        d4,  d0,  d3            @  R*s
        vmul.f32        d5,  d20, d2            @  I*c
        subs            lr,  lr,  #16
        vsub.f32        d6,  d7,  d6            @  I*s-R*c
        vadd.f32        d7,  d4,  d5            @  R*s-I*c
        uxth            r10, r6,  ror #16
        uxth            r6,  r6
        add             r10, r1,  r10, lsl #3
        add             r6,  r1,  r6,  lsl #3
        beq             1f
        vld2.32         {d16,d18},[r9,:128],r12 @ x,x in2d1,in2d0
        vld2.32         {d17,d19},[r8,:128],r12 @ x,x in1d1,in1d0
        vld2.32         {d20,d21},[r7,:128]!    @ in2u0,in2u1 x,x
        vrev64.32       q9,  q9                 @ in2d0,in2d1 in1d0,in1d1
        vld2.32         {d0, d1}, [r2,:128]!    @ in0u0,in0u1 x,x
        vsub.f32        d0,  d0,  d18           @ in0u-in2d    R
        vld1.32         {d2},     [r4,:64]!     @ c0,c1
        vadd.f32        d20, d20, d19           @ in2u+in1d   -I
        vld1.32         {d3},     [r5,:64]!     @ s0,s1
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r10,:64]
        b               1b
1:
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r10,:64]

        mov             r4,  r0
        mov             r6,  r1
        bl              ff_fft_calc_neon

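        @ Post-rotation, as in ff_imdct_half_neon above, except that the
        @ real outputs are negated (the vneg.f32 q2 below), matching the
        @ -tcos/-tsin twiddle signs in the scalar ff_mdct_calc_c.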
        mov             r12, #1
        ldr             lr,  [r4, #28]          @ mdct_bits
        ldr             r5,  [r4, #36]          @ tsin
        ldr             r4,  [r4, #32]          @ tcos
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

        add             r4,  r4,  lr,  lsl #2
        add             r5,  r5,  lr,  lsl #2
        add             r6,  r6,  lr,  lsl #3
        sub             r1,  r4,  #8
        sub             r2,  r5,  #8
        sub             r3,  r6,  #16

        mov             r7,  #-16
        mov             r12, #-8
        mov             r8,  r6
        mov             r0,  r3

        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
        vld2.32         {d20-d21},[r6,:128]!    @ d20=r2,i2 d21=r3,i3
        vld1.32         {d18},    [r2,:64], r12 @ d18=s1,s0
1:
        subs            lr,  lr,  #2
        vmul.f32        d7,  d0,  d18           @ r1*s1,r0*s0
        vld1.32         {d19},    [r5,:64]!     @ s2,s3
        vmul.f32        d4,  d1,  d18           @ i1*s1,i0*s0
        vld1.32         {d16},    [r1,:64], r12 @ c1,c0
        vmul.f32        d5,  d21, d19           @ i2*s2,i3*s3
        vld1.32         {d17},    [r4,:64]!     @ c2,c3
        vmul.f32        d6,  d20, d19           @ r2*s2,r3*s3
        vmul.f32        d24, d0,  d16           @ r1*c1,r0*c0
        vmul.f32        d25, d20, d17           @ r2*c2,r3*c3
        vmul.f32        d22, d21, d17           @ i2*c2,i3*c3
        vmul.f32        d23, d1,  d16           @ i1*c1,i0*c0
        vadd.f32        d4,  d4,  d24           @ i1*s1+r1*c1,i0*s0+r0*c0
        vadd.f32        d5,  d5,  d25           @ i2*s2+r2*c2,i3*s3+r3*c3
        vsub.f32        d6,  d22, d6            @ i2*c2-r2*s2,i3*c3-r3*s3
        vsub.f32        d7,  d23, d7            @ i1*c1-r1*s1,i0*c0-r0*s0
        vneg.f32        q2,  q2
        beq             1f
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld1.32         {d18},    [r2,:64], r12
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128], r7
        vst2.32         {d5,d7},  [r8,:128]!
        b               1b
1:
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128]
        vst2.32         {d5,d7},  [r8,:128]

        pop             {r4-r10,pc}
.endfunc