ARM: interleave cos/sin tables for improved NEON MDCT
libavcodec/arm/mdct_neon.S
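The rotation loops consume one cosine and one sine per element.  With
separate tcos/tsin tables that takes two loads plus extra pointer
bookkeeping; storing the two tables interleaved lets a single vld2.32
fetch a pair of twiddles and de-interleave them directly into two d
registers (the {d2,d3} and {d16,d18} loads below).  A minimal C sketch
of building such a table, assuming the twiddle definition used by
FFmpeg's ff_mdct_init (the type and function names here are
hypothetical):

    #include <math.h>
    #include <stdlib.h>

    #ifndef M_PI
    #define M_PI 3.14159265358979323846
    #endif

    /* One interleaved entry: cos in the even lane, sin in the odd lane,
     * so vld2.32 splits pairs into d2 = c0,c1 and d3 = s0,s1. */
    typedef struct { float c, s; } cs_pair;

    /* n = 1 << mdct_bits is the full transform size; n/4 pairs are used. */
    static cs_pair *build_interleaved_tcos(int n, double scale)
    {
        int n4 = n >> 2;
        cs_pair *tab = malloc(n4 * sizeof(*tab));
        double theta = 1.0 / 8.0 + (scale < 0 ? n4 : 0);
        if (!tab)
            return NULL;
        scale = sqrt(fabs(scale));
        for (int i = 0; i < n4; i++) {
            double alpha = 2 * M_PI * (i + theta) / n;
            tab[i].c = -cos(alpha) * scale;
            tab[i].s = -sin(alpha) * scale;
        }
        return tab;
    }

The assembly then walks one pointer (r4, plus a second downward pointer
for the post-rotation) through this single table instead of tracking
tcos and tsin separately.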
/*
 * ARM NEON optimised MDCT
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        .fpu neon
        .text
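
@ ff_imdct_half(ctx, output, input): r0 = context (mdct_bits, tcos and
@ revtab are loaded from it), r1 = output, r2 = input.
@ Stage 1 pre-rotates the input by the interleaved cos/sin twiddles while
@ scattering to bit-reversed positions (revtab), stage 2 runs an in-place
@ n/4-point complex FFT (ff_fft_calc_neon), and stage 3 post-rotates the
@ result, writing the n/2 output samples outward from the middle of the
@ buffer.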
function ff_imdct_half_neon, export=1
        push            {r4-r8,lr}

        mov             r12, #1
        ldr             lr,  [r0, #28]          @ mdct_bits
        ldr             r4,  [r0, #32]          @ tcos
        ldr             r3,  [r0, #8]           @ revtab
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #2            @ n4 = n >> 2
        add             r7,  r2,  r12, lsl #1
        mov             r12, #-16
        sub             r7,  r7,  #16

        vld2.32         {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
        vld2.32         {d0-d1},  [r2,:128]!    @ d0 =m0,x d1 =m1,x
        vrev64.32       d17, d17
        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s1
        vmul.f32        d6,  d17, d2
        vmul.f32        d7,  d0,  d2
1:
        subs            lr,  lr,  #2
        ldr             r6,  [r3], #4
        vmul.f32        d4,  d0,  d3
        vmul.f32        d5,  d17, d3
        vsub.f32        d4,  d6,  d4
        vadd.f32        d5,  d5,  d7
        uxth            r8,  r6,  ror #16
        uxth            r6,  r6
        add             r8,  r1,  r8,  lsl #3
        add             r6,  r1,  r6,  lsl #3
        beq             1f
        vld2.32         {d16-d17},[r7,:128],r12
        vld2.32         {d0-d1},  [r2,:128]!
        vrev64.32       d17, d17
        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s1
        vmul.f32        d6,  d17, d2
        vmul.f32        d7,  d0,  d2
        vst2.32         {d4[0],d5[0]}, [r6,:64]
        vst2.32         {d4[1],d5[1]}, [r8,:64]
        b               1b
1:
        vst2.32         {d4[0],d5[0]}, [r6,:64]
        vst2.32         {d4[1],d5[1]}, [r8,:64]

        mov             r4,  r0
        mov             r6,  r1
        bl              ff_fft_calc_neon

        mov             r12, #1
        ldr             lr,  [r4, #28]          @ mdct_bits
        ldr             r4,  [r4, #32]          @ tcos
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

        add             r4,  r4,  lr,  lsl #3
        add             r6,  r6,  lr,  lsl #3
        sub             r1,  r4,  #16
        sub             r3,  r6,  #16

        mov             r7,  #-16
        mov             r8,  r6
        mov             r0,  r3

        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
        vld2.32         {d20-d21},[r6,:128]!    @ d20=i2,r2 d21=i3,r3
        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
1:
        subs            lr,  lr,  #2
        vmul.f32        d7,  d0,  d18
        vld2.32         {d17,d19},[r4,:128]!    @ d17=c2,c3 d19=s2,s3
        vmul.f32        d4,  d1,  d18
        vmul.f32        d5,  d21, d19
        vmul.f32        d6,  d20, d19
        vmul.f32        d22, d1,  d16
        vmul.f32        d23, d21, d17
        vmul.f32        d24, d0,  d16
        vmul.f32        d25, d20, d17
        vadd.f32        d7,  d7,  d22
        vadd.f32        d6,  d6,  d23
        vsub.f32        d4,  d4,  d24
        vsub.f32        d5,  d5,  d25
        beq             1f
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128], r7
        vst2.32         {d5,d7},  [r8,:128]!
        b               1b
1:
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128]
        vst2.32         {d5,d7},  [r8,:128]

        pop             {r4-r8,pc}
.endfunc
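
@ ff_imdct_calc: full-length IMDCT.  ff_imdct_half_neon computes the half
@ transform into the middle of the output buffer; the loop below then
@ mirrors it into the outer quarters.  In scalar terms, for 0 <= k < n/4:
@     output[k]         = -output[n/2 - 1 - k]
@     output[n - 1 - k] =  output[n/2 + k]
@ The negation is a sign-bit flip (veor with d30 = 1<<31).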
function ff_imdct_calc_neon, export=1
        push            {r4-r6,lr}

        ldr             r3,  [r0, #28]          @ mdct_bits
        mov             r4,  #1
        mov             r5,  r1                 @ save output
        lsl             r4,  r4,  r3            @ n = 1 << mdct_bits
        add             r1,  r1,  r4            @ output + n/4 floats (n bytes)

        bl              ff_imdct_half_neon

        add             r0,  r5,  r4,  lsl #2   @ output + n   floats
        add             r1,  r5,  r4,  lsl #1   @ output + n/2 floats
        sub             r0,  r0,  #8
        sub             r2,  r1,  #16
        mov             r3,  #-16
        mov             r6,  #-8
        vmov.i32        d30, #1<<31             @ float sign bit
1:
        vld1.32         {d0-d1},  [r2,:128], r3 @ 4 floats below output[n/2]
        pld             [r0, #-16]
        vrev64.32       q0,  q0
        vld1.32         {d2-d3},  [r1,:128]!    @ 4 floats from output[n/2] up
        veor            d4,  d1,  d30           @ negate by sign-bit flip
        pld             [r2, #-16]
        vrev64.32       q1,  q1
        veor            d5,  d0,  d30           @ negate by sign-bit flip
        vst1.32         {d2},     [r0,:64], r6  @ output[n-1-k] =  output[n/2+k]
        vst1.32         {d3},     [r0,:64], r6
        vst1.32         {d4-d5},  [r5,:128]!    @ output[k]     = -output[n/2-1-k]
        subs            r4,  r4,  #16
        bgt             1b

        pop             {r4-r6,pc}
.endfunc
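
@ ff_mdct_calc: forward MDCT.  Two pre-rotation passes fold the n input
@ samples into n/4 complex values, scattered to bit-reversed positions
@ (revtab) as in the inverse transform; an in-place n/4-point FFT and a
@ final post-rotation then produce the n/2 output coefficients.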
function ff_mdct_calc_neon, export=1
        push            {r4-r10,lr}

        mov             r12, #1
        ldr             lr,  [r0, #28]          @ mdct_bits
        ldr             r4,  [r0, #32]          @ tcos
        ldr             r3,  [r0, #8]           @ revtab
        lsl             lr,  r12, lr            @ n = 1 << nbits
        add             r7,  r2,  lr            @ in4u
        sub             r9,  r7,  #16           @ in4d
        add             r2,  r7,  lr,  lsl #1   @ in3u
        add             r8,  r9,  lr,  lsl #1   @ in3d
        mov             r12, #-16

        vld2.32         {d16,d18},[r9,:128],r12 @ x,x in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ x,x in3d1,in3d0
        vld2.32         {d20,d21},[r7,:128]!    @ in4u0,in4u1 x,x
        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
        vld2.32         {d0, d1}, [r2,:128]!    @ in3u0,in3u1 x,x
        vsub.f32        d20, d18, d20           @ in4d-in4u      I
        vld2.32         {d2,d3},  [r4,:128]!    @ c0,c1 s0,s1
        vadd.f32        d0,  d0,  d19           @ in3u+in3d     -R
1:
        vmul.f32        d7,  d20, d3            @  I*s
        vmul.f32        d6,  d0,  d2            @ -R*c
        ldr             r6,  [r3], #4
        vmul.f32        d4,  d0,  d3            @ -R*s
        vmul.f32        d5,  d20, d2            @  I*c
        subs            lr,  lr,  #16
        vsub.f32        d6,  d6,  d7            @ -R*c-I*s
        vadd.f32        d7,  d4,  d5            @ -R*s+I*c
        uxth            r10, r6,  ror #16
        uxth            r6,  r6
        add             r10, r1,  r10, lsl #3
        add             r6,  r1,  r6,  lsl #3
        beq             1f
        vld2.32         {d16,d18},[r9,:128],r12 @ x,x in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ x,x in3d1,in3d0
        vneg.f32        d7,  d7                 @  R*s-I*c
        vld2.32         {d20,d21},[r7,:128]!    @ in4u0,in4u1 x,x
        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
        vld2.32         {d0, d1}, [r2,:128]!    @ in3u0,in3u1 x,x
        vsub.f32        d20, d18, d20           @ in4d-in4u      I
        vld2.32         {d2,d3},  [r4,:128]!    @ c0,c1 s0,s1
        vadd.f32        d0,  d0,  d19           @ in3u+in3d     -R
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r10,:64]
        b               1b
1:
        vneg.f32        d7,  d7                 @  R*s-I*c
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r10,:64]

        mov             r12, #1
        ldr             lr,  [r0, #28]          @ mdct_bits
        lsl             lr,  r12, lr            @ n = 1 << nbits
        sub             r8,  r2,  #16           @ in1d
        add             r2,  r9,  #16           @ in0u
        sub             r9,  r7,  #16           @ in2d
        mov             r12, #-16

        vld2.32         {d16,d18},[r9,:128],r12 @ x,x in2d1,in2d0
        vld2.32         {d17,d19},[r8,:128],r12 @ x,x in1d1,in1d0
        vld2.32         {d20,d21},[r7,:128]!    @ in2u0,in2u1 x,x
        vrev64.32       q9,  q9                 @ in2d0,in2d1 in1d0,in1d1
        vld2.32         {d0, d1}, [r2,:128]!    @ in0u0,in0u1 x,x
        vsub.f32        d0,  d0,  d18           @ in0u-in2d      R
        vld2.32         {d2,d3},  [r4,:128]!    @ c0,c1 s0,s1
        vadd.f32        d20, d20, d19           @ in2u+in1d     -I
1:
        vmul.f32        d6,  d0,  d2            @  R*c
        vmul.f32        d7,  d20, d3            @ -I*s
        ldr             r6,  [r3], #4
        vmul.f32        d4,  d0,  d3            @  R*s
        vmul.f32        d5,  d20, d2            @ -I*c
        subs            lr,  lr,  #16
        vsub.f32        d6,  d7,  d6            @ -I*s-R*c
        vadd.f32        d7,  d4,  d5            @  R*s-I*c
        uxth            r10, r6,  ror #16
        uxth            r6,  r6
        add             r10, r1,  r10, lsl #3
        add             r6,  r1,  r6,  lsl #3
        beq             1f
        vld2.32         {d16,d18},[r9,:128],r12 @ x,x in2d1,in2d0
        vld2.32         {d17,d19},[r8,:128],r12 @ x,x in1d1,in1d0
        vld2.32         {d20,d21},[r7,:128]!    @ in2u0,in2u1 x,x
        vrev64.32       q9,  q9                 @ in2d0,in2d1 in1d0,in1d1
        vld2.32         {d0, d1}, [r2,:128]!    @ in0u0,in0u1 x,x
        vsub.f32        d0,  d0,  d18           @ in0u-in2d      R
        vld2.32         {d2,d3},  [r4,:128]!    @ c0,c1 s0,s1
        vadd.f32        d20, d20, d19           @ in2u+in1d     -I
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r10,:64]
        b               1b
1:
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r10,:64]

        mov             r4,  r0
        mov             r6,  r1
        bl              ff_fft_calc_neon

        mov             r12, #1
        ldr             lr,  [r4, #28]          @ mdct_bits
        ldr             r4,  [r4, #32]          @ tcos
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

        add             r4,  r4,  lr,  lsl #3
        add             r6,  r6,  lr,  lsl #3
        sub             r1,  r4,  #16
        sub             r3,  r6,  #16

        mov             r7,  #-16
        mov             r8,  r6
        mov             r0,  r3

        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
        vld2.32         {d20-d21},[r6,:128]!    @ d20=r2,i2 d21=r3,i3
        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
1:
        subs            lr,  lr,  #2
        vmul.f32        d7,  d0,  d18           @ r1*s1,r0*s0
        vld2.32         {d17,d19},[r4,:128]!    @ c2,c3 s2,s3
        vmul.f32        d4,  d1,  d18           @ i1*s1,i0*s0
        vmul.f32        d5,  d21, d19           @ i2*s2,i3*s3
        vmul.f32        d6,  d20, d19           @ r2*s2,r3*s3
        vmul.f32        d24, d0,  d16           @ r1*c1,r0*c0
        vmul.f32        d25, d20, d17           @ r2*c2,r3*c3
        vmul.f32        d22, d21, d17           @ i2*c2,i3*c3
        vmul.f32        d23, d1,  d16           @ i1*c1,i0*c0
        vadd.f32        d4,  d4,  d24           @ i1*s1+r1*c1,i0*s0+r0*c0
        vadd.f32        d5,  d5,  d25           @ i2*s2+r2*c2,i3*s3+r3*c3
        vsub.f32        d6,  d22, d6            @ i2*c2-r2*s2,i3*c3-r3*s3
        vsub.f32        d7,  d23, d7            @ i1*c1-r1*s1,i0*c0-r0*s0
        vneg.f32        q2,  q2
        beq             1f
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128], r7
        vst2.32         {d5,d7},  [r8,:128]!
        b               1b
1:
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128]
        vst2.32         {d5,d7},  [r8,:128]

        pop             {r4-r10,pc}
.endfunc