ARM: merge two loops in ff_mdct_calc_neon
libavcodec/arm/mdct_neon.S
/*
 * ARM NEON optimised MDCT
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        .fpu neon
        .text

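@ ff_imdct_half_neon: first half of the inverse MDCT.
@   r0 = context (mdct_bits at #28, tcos at #32, revtab at #8)
@   r1 = output buffer (n/4 complex values), r2 = input samples
@ Three stages: a pre-rotation of the input by the tcos twiddles with a
@ bit-reversed (revtab) scatter into the output, an in-place complex FFT
@ via ff_fft_calc_neon, then a post-rotation written back in place.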
function ff_imdct_half_neon, export=1
        push            {r4-r8,lr}

        mov             r12, #1
        ldr             lr,  [r0, #28]          @ mdct_bits
        ldr             r4,  [r0, #32]          @ tcos
        ldr             r3,  [r0, #8]           @ revtab
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #2            @ n4 = n >> 2
        add             r7,  r2,  r12, lsl #1
        mov             r12, #-16
        sub             r7,  r7,  #16

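@ Pre-rotation: read the input from both ends (r2 forwards, r7 backwards),
@ multiply by the cos/sin twiddles and scatter the complex results to
@ output[revtab[i]], two values per iteration.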
        vld2.32         {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
        vld2.32         {d0-d1},  [r2,:128]!    @ d0 =m0,x d1 =m1,x
        vrev64.32       d17, d17
        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s1
        vmul.f32        d6,  d17, d2
        vmul.f32        d7,  d0,  d2
1:
        subs            lr,  lr,  #2
        ldr             r6,  [r3], #4
        vmul.f32        d4,  d0,  d3
        vmul.f32        d5,  d17, d3
        vsub.f32        d4,  d6,  d4
        vadd.f32        d5,  d5,  d7
        uxth            r8,  r6,  ror #16
        uxth            r6,  r6
        add             r8,  r1,  r8,  lsl #3
        add             r6,  r1,  r6,  lsl #3
        beq             1f
        vld2.32         {d16-d17},[r7,:128],r12
        vld2.32         {d0-d1},  [r2,:128]!
        vrev64.32       d17, d17
        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s1
        vmul.f32        d6,  d17, d2
        vmul.f32        d7,  d0,  d2
        vst2.32         {d4[0],d5[0]}, [r6,:64]
        vst2.32         {d4[1],d5[1]}, [r8,:64]
        b               1b
1:
        vst2.32         {d4[0],d5[0]}, [r6,:64]
        vst2.32         {d4[1],d5[1]}, [r8,:64]

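@ In-place complex FFT over the n/4 pre-rotated values.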
        mov             r4,  r0
        mov             r6,  r1
        bl              ff_fft_calc_neon

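@ Post-rotation: walk the buffer from the centre outwards (r0/r3 downwards,
@ r6/r8 upwards), multiply by the tcos twiddles and store the rotated values
@ back in place, two complex values per side per iteration.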
        mov             r12, #1
        ldr             lr,  [r4, #28]          @ mdct_bits
        ldr             r4,  [r4, #32]          @ tcos
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

        add             r4,  r4,  lr,  lsl #3
        add             r6,  r6,  lr,  lsl #3
        sub             r1,  r4,  #16
        sub             r3,  r6,  #16

        mov             r7,  #-16
        mov             r8,  r6
        mov             r0,  r3

        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
        vld2.32         {d20-d21},[r6,:128]!    @ d20=i2,r2 d21=i3,r3
        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
1:
        subs            lr,  lr,  #2
        vmul.f32        d7,  d0,  d18
        vld2.32         {d17,d19},[r4,:128]!    @ d17=c2,c3 d19=s2,s3
        vmul.f32        d4,  d1,  d18
        vmul.f32        d5,  d21, d19
        vmul.f32        d6,  d20, d19
        vmul.f32        d22, d1,  d16
        vmul.f32        d23, d21, d17
        vmul.f32        d24, d0,  d16
        vmul.f32        d25, d20, d17
        vadd.f32        d7,  d7,  d22
        vadd.f32        d6,  d6,  d23
        vsub.f32        d4,  d4,  d24
        vsub.f32        d5,  d5,  d25
        beq             1f
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128], r7
        vst2.32         {d5,d7},  [r8,:128]!
        b               1b
1:
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128]
        vst2.32         {d5,d7},  [r8,:128]

        pop             {r4-r8,pc}
        .endfunc

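@ ff_imdct_calc_neon: full inverse MDCT.
@   r0 = context, r1 = output (n samples), r2 = input
@ Runs ff_imdct_half_neon into the middle of the output buffer, then mirrors
@ the half-size result to fill the whole window:
@   output[k]     = -output[n/2-1-k]   for k < n/4
@   output[n-1-k] =  output[n/2+k]     for k < n/4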
function ff_imdct_calc_neon, export=1
        push            {r4-r6,lr}

        ldr             r3,  [r0, #28]
        mov             r4,  #1
        mov             r5,  r1
        lsl             r4,  r4,  r3
        add             r1,  r1,  r4

        bl              ff_imdct_half_neon

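@ Mirror the half-size IMDCT into the full output: the first quarter is a
@ reversed, sign-flipped copy of the second quarter (sign flipped by XORing
@ the sign bit, d30 = 1<<31), the last quarter a reversed copy of the third.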
        add             r0,  r5,  r4,  lsl #2
        add             r1,  r5,  r4,  lsl #1
        sub             r0,  r0,  #8
        sub             r2,  r1,  #16
        mov             r3,  #-16
        mov             r6,  #-8
        vmov.i32        d30, #1<<31
1:
        vld1.32         {d0-d1},  [r2,:128], r3
        pld             [r0, #-16]
        vrev64.32       q0,  q0
        vld1.32         {d2-d3},  [r1,:128]!
        veor            d4,  d1,  d30
        pld             [r2, #-16]
        vrev64.32       q1,  q1
        veor            d5,  d0,  d30
        vst1.32         {d2},     [r0,:64], r6
        vst1.32         {d3},     [r0,:64], r6
        vst1.32         {d4-d5},  [r5,:128]!
        subs            r4,  r4,  #16
        bgt             1b

        pop             {r4-r6,pc}
        .endfunc

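@ ff_mdct_calc_neon: forward MDCT.
@   r0 = context, r1 = output, r2 = windowed input
@ The input fold and the pre-rotation are done in a single merged loop: each
@ iteration folds samples gathered from the input with ascending and
@ descending pointers, multiplies by the tcos twiddles and scatters the
@ complex results to output[revtab[i]]. An in-place FFT and a post-rotation
@ follow, as in ff_imdct_half_neon.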
function ff_mdct_calc_neon, export=1
        push            {r4-r10,lr}

        mov             r12, #1
        ldr             lr,  [r0, #28]          @ mdct_bits
        ldr             r4,  [r0, #32]          @ tcos
        ldr             r3,  [r0, #8]           @ revtab
        lsl             lr,  r12, lr            @ n = 1 << nbits
        add             r7,  r2,  lr            @ in4u
        sub             r9,  r7,  #16           @ in4d
        add             r2,  r7,  lr,  lsl #1   @ in3u
        add             r8,  r9,  lr,  lsl #1   @ in3d
        add             r5,  r4,  lr,  lsl #1
        sub             r5,  r5,  #16
        sub             r3,  r3,  #4
        mov             r12, #-16

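@ Combined fold + pre-rotation loop: two complex values per side are built
@ from the folded input quarters, rotated by twiddles taken from both ends
@ of tcos, and stored via the revtab indices (r6 and r10).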
        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1 in2d1,in2d0
        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1 in1d1,in1d0
        vsub.f32        d0,  d18, d0            @ in4d-in4u      I
        vld2.32         {d20,d21},[r4,:128]!    @ c0,c1 s0,s1
        vrev64.32       q1,  q1                 @ in2d0,in2d1 in1d0,in1d1
        vld2.32         {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
        vadd.f32        d1,  d1,  d19           @ in3u+in3d     -R
        vsub.f32        d16, d16, d2            @ in0u-in2d      R
        vadd.f32        d17, d17, d3            @ in2u+in1d     -I
1:
        vmul.f32        d7,  d0,  d21           @  I*s
        ldr             r10, [r3, lr, lsr #1]
        vmul.f32        d6,  d1,  d20           @ -R*c
        ldr             r6,  [r3, #4]!
        vmul.f32        d4,  d1,  d21           @ -R*s
        vmul.f32        d5,  d0,  d20           @  I*c
        vmul.f32        d24, d16, d30           @  R*c
        vmul.f32        d25, d17, d31           @ -I*s
        vmul.f32        d22, d16, d31           @  R*s
        vmul.f32        d23, d17, d30           @  I*c
        subs            lr,  lr,  #16
        vsub.f32        d6,  d6,  d7            @ -R*c-I*s
        vadd.f32        d7,  d4,  d5            @ -R*s+I*c
        vsub.f32        d24, d25, d24           @  I*s-R*c
        vadd.f32        d25, d22, d23           @  R*s-I*c
        beq             1f
        mov             r12, #-16
        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
        vneg.f32        d7,  d7                 @  R*s-I*c
        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1 in2d1,in2d0
        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1 in1d1,in1d0
        vsub.f32        d0,  d18, d0            @ in4d-in4u      I
        vld2.32         {d20,d21},[r4,:128]!    @ c0,c1 s0,s1
        vrev64.32       q1,  q1                 @ in2d0,in2d1 in1d0,in1d1
        vld2.32         {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
        vadd.f32        d1,  d1,  d19           @ in3u+in3d     -R
        vsub.f32        d16, d16, d2            @ in0u-in2d      R
        vadd.f32        d17, d17, d3            @ in2u+in1d     -I
        uxth            r12, r6,  ror #16
        uxth            r6,  r6
        add             r12, r1,  r12, lsl #3
        add             r6,  r1,  r6,  lsl #3
        vst2.32         {d6[0],d7[0]},  [r6,:64]
        vst2.32         {d6[1],d7[1]},  [r12,:64]
        uxth            r6,  r10, ror #16
        uxth            r10, r10
        add             r6,  r1,  r6,  lsl #3
        add             r10, r1,  r10, lsl #3
        vst2.32         {d24[0],d25[0]},[r10,:64]
        vst2.32         {d24[1],d25[1]},[r6,:64]
        b               1b
1:
        vneg.f32        d7,  d7                 @  R*s-I*c
        uxth            r12, r6,  ror #16
        uxth            r6,  r6
        add             r12, r1,  r12, lsl #3
        add             r6,  r1,  r6,  lsl #3
        vst2.32         {d6[0],d7[0]},  [r6,:64]
        vst2.32         {d6[1],d7[1]},  [r12,:64]
        uxth            r6,  r10, ror #16
        uxth            r10, r10
        add             r6,  r1,  r6,  lsl #3
        add             r10, r1,  r10, lsl #3
        vst2.32         {d24[0],d25[0]},[r10,:64]
        vst2.32         {d24[1],d25[1]},[r6,:64]

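@ In-place complex FFT over the folded, pre-rotated values.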
        mov             r4,  r0
        mov             r6,  r1
        bl              ff_fft_calc_neon

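@ Post-rotation: same centre-outwards pattern as in ff_imdct_half_neon, with
@ an extra negation (vneg.f32 q2) of one component for the forward transform.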
        mov             r12, #1
        ldr             lr,  [r4, #28]          @ mdct_bits
        ldr             r4,  [r4, #32]          @ tcos
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

        add             r4,  r4,  lr,  lsl #3
        add             r6,  r6,  lr,  lsl #3
        sub             r1,  r4,  #16
        sub             r3,  r6,  #16

        mov             r7,  #-16
        mov             r8,  r6
        mov             r0,  r3

        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
        vld2.32         {d20-d21},[r6,:128]!    @ d20=r2,i2 d21=r3,i3
        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
1:
        subs            lr,  lr,  #2
        vmul.f32        d7,  d0,  d18           @ r1*s1,r0*s0
        vld2.32         {d17,d19},[r4,:128]!    @ c2,c3 s2,s3
        vmul.f32        d4,  d1,  d18           @ i1*s1,i0*s0
        vmul.f32        d5,  d21, d19           @ i2*s2,i3*s3
        vmul.f32        d6,  d20, d19           @ r2*s2,r3*s3
        vmul.f32        d24, d0,  d16           @ r1*c1,r0*c0
        vmul.f32        d25, d20, d17           @ r2*c2,r3*c3
        vmul.f32        d22, d21, d17           @ i2*c2,i3*c3
        vmul.f32        d23, d1,  d16           @ i1*c1,i0*c0
        vadd.f32        d4,  d4,  d24           @ i1*s1+r1*c1,i0*s0+r0*c0
        vadd.f32        d5,  d5,  d25           @ i2*s2+r2*c2,i3*s3+r3*c3
        vsub.f32        d6,  d22, d6            @ i2*c2-r2*s2,i3*c3-r3*s3
        vsub.f32        d7,  d23, d7            @ i1*c1-r1*s1,i0*c0-r0*s0
        vneg.f32        q2,  q2
        beq             1f
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128], r7
        vst2.32         {d5,d7},  [r8,:128]!
        b               1b
1:
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128]
        vst2.32         {d5,d7},  [r8,:128]

        pop             {r4-r10,pc}
        .endfunc