/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        preserve8
        .text

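@ ff_clear_block_neon zeroes one 8x8 block of 16-bit coefficients
@ (64 x int16, written as eight 16-byte stores); ff_clear_blocks_neon
@ does the same for six consecutive blocks.  r0 = block pointer.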
function ff_clear_block_neon, export=1
        vmov.i16        q0, #0
.rept 8
        vst1.16         {q0}, [r0,:128]!
.endr
        bx              lr
endfunc

function ff_clear_blocks_neon, export=1
        vmov.i16        q0, #0
.rept 8*6
        vst1.16         {q0}, [r0,:128]!
.endr
        bx              lr
endfunc

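@ pixels16: copy a 16xh block from src to dst; with avg=1, average the
@ new pixels into the existing destination with vrhadd.u8 (rounding
@ halving add).  r0 = dst, r1 = src, r2 = line size, r3 = h, which is
@ assumed to be a multiple of 4 (assumed prototype: the usual dsputil
@ op_pixels_func(uint8_t *block, const uint8_t *pixels, int line_size,
@ int h)).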
.macro pixels16 avg=0
.if \avg
        mov             ip, r0
.endif
1:      vld1.64         {d0, d1}, [r1], r2
        vld1.64         {d2, d3}, [r1], r2
        vld1.64         {d4, d5}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip,:128], r2
        vrhadd.u8       q0, q0, q8
        vld1.64         {d18,d19}, [ip,:128], r2
        vrhadd.u8       q1, q1, q9
        vld1.64         {d20,d21}, [ip,:128], r2
        vrhadd.u8       q2, q2, q10
        vld1.64         {d22,d23}, [ip,:128], r2
        vrhadd.u8       q3, q3, q11
.endif
        subs            r3, r3, #4
        vst1.64         {d0, d1}, [r0,:128], r2
        vst1.64         {d2, d3}, [r0,:128], r2
        vst1.64         {d4, d5}, [r0,:128], r2
        vst1.64         {d6, d7}, [r0,:128], r2
        bne             1b
        bx              lr
.endm

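@ pixels16_x2: horizontal half-pel interpolation.  Each output pixel is
@ the average of a source pixel and its right-hand neighbour, formed
@ with vext (row shifted left by one byte) followed by a halving add.
@ The rounding variant uses vrhadd.u8; the no_rnd variant substitutes
@ the truncating vhadd.u8.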
.macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2}, [r1], r2
        vld1.64         {d4-d6}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3, r3, #2
        vext.8          q1, q0, q1, #1
        \vhadd          q0, q0, q1
        vext.8          q3, q2, q3, #1
        \vhadd          q2, q2, q3
        vst1.64         {d0, d1}, [r0,:128], r2
        vst1.64         {d4, d5}, [r0,:128], r2
        bne             1b
        bx              lr
.endm

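@ pixels16_y2: vertical half-pel interpolation.  Two source rows stay
@ live in q0/q1; each iteration averages the current pair, reloads one
@ row, averages again, and emits two output rows.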
.macro pixels16_y2 vhadd=vrhadd.u8
        vld1.64         {d0, d1}, [r1], r2
        vld1.64         {d2, d3}, [r1], r2
1:      subs            r3, r3, #2
        \vhadd          q2, q0, q1
        vld1.64         {d0, d1}, [r1], r2
        \vhadd          q3, q0, q1
        vld1.64         {d2, d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4, d5}, [r0,:128], r2
        vst1.64         {d6, d7}, [r0,:128], r2
        bne             1b
        bx              lr
.endm

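@ pixels16_xy2: 2D half-pel interpolation, (a + b + c + d + 2) >> 2 over
@ each 2x2 source quad.  Horizontal pairs are widened and summed with
@ vaddl.u8, the row sums are added, and vrshrn.u16 #2 performs the
@ rounded downshift.  The no_rnd variant adds a bias of 1 (q13) and
@ narrows with the truncating vshrn.u16, giving (a + b + c + d + 1) >> 2.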
.macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0-d2}, [r1], r2
        vld1.64         {d4-d6}, [r1], r2
.if \no_rnd
        vmov.i16        q13, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1, q0, q1, #1
        vext.8          q3, q2, q3, #1
        vaddl.u8        q8, d0, d2
        vaddl.u8        q10, d1, d3
        vaddl.u8        q9, d4, d6
        vaddl.u8        q11, d5, d7
1:      subs            r3, r3, #2
        vld1.64         {d0-d2}, [r1], r2
        vadd.u16        q12, q8, q9
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0, q1, #1
        vadd.u16        q1, q10, q11
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1, q1, q13
.endif
        \vshrn          d29, q1, #2
        vaddl.u8        q8, d0, d30
        vld1.64         {d2-d4}, [r1], r2
        vaddl.u8        q10, d1, d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8, q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2, q1, q2, #1
        vadd.u16        q0, q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0, q0, q13
.endif
        \vshrn          d31, q0, #2
        vaddl.u8        q9, d2, d4
        vaddl.u8        q11, d3, d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        bx              lr
.endm

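@ pixels8 / pixels8_x2 / pixels8_y2 / pixels8_xy2: 8-pixel-wide versions
@ of the macros above, operating on d registers instead of q registers.
@ Register usage is the same: r0 = dst, r1 = src, r2 = line size, r3 = h.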
.macro pixels8 avg=0
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d4}, [r0,:64], r2
        vrhadd.u8       d0, d0, d4
        vld1.64         {d5}, [r0,:64], r2
        vrhadd.u8       d1, d1, d5
        vld1.64         {d6}, [r0,:64], r2
        vrhadd.u8       d2, d2, d6
        vld1.64         {d7}, [r0,:64], r2
        vrhadd.u8       d3, d3, d7
        sub             r0, r0, r2, lsl #2
.endif
        subs            r3, r3, #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1}, [r1], r2
        vext.8          d1, d0, d1, #1
        vld1.64         {d2, d3}, [r1], r2
        vext.8          d3, d2, d3, #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3, r3, #2
        vswp            d1, d2
        \vhadd          q0, q0, q1
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro pixels8_y2 vhadd=vrhadd.u8
        vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
1:      subs            r3, r3, #2
        \vhadd          d4, d0, d1
        vld1.64         {d0}, [r1], r2
        \vhadd          d5, d0, d1
        vld1.64         {d1}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0, d1}, [r1], r2
        vld1.64         {d2, d3}, [r1], r2
.if \no_rnd
        vmov.i16        q11, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4, d0, d1, #1
        vext.8          d6, d2, d3, #1
        vaddl.u8        q8, d0, d4
        vaddl.u8        q9, d2, d6
1:      subs            r3, r3, #2
        vld1.64         {d0, d1}, [r1], r2
        pld             [r1]
        vadd.u16        q10, q8, q9
        vext.8          d4, d0, d1, #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8, d0, d4
        \vshrn          d5, q10, #2
        vld1.64         {d2, d3}, [r1], r2
        vadd.u16        q10, q8, q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5}, [r0,:64], r2
        \vshrn          d7, q10, #2
        vext.8          d6, d2, d3, #1
        vaddl.u8        q9, d2, d6
        vst1.64         {d7}, [r0,:64], r2
        bgt             1b
        bx              lr
.endm

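@ pixfunc expands one of the macros above into an exported function
@ ff_<pfx><name><suf>_neon; pixfunc2 instantiates both the rounding
@ version and the _no_rnd variant, passing the truncating operations in
@ as extra arguments.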
.macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd_op \args
endfunc
.endm

.macro pixfunc2 pfx name args:vararg
        pixfunc         \pfx \name
        pixfunc         \pfx \name \args
.endm

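@ The h264 qpel mc00 cases are plain copies at a fixed height.  Each
@ stub below only sets r3 (the height) and falls through into the
@ pixfunc expansion that immediately follows it; endfunc marks the end
@ of the function without emitting a return.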
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
endfunc

        pixfunc         put_ pixels16
        pixfunc2        put_ pixels16_x2, _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_y2, _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
endfunc

        pixfunc         avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3, #8
endfunc

        pixfunc         put_ pixels8
        pixfunc2        put_ pixels8_x2, _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_y2, _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3, #8
endfunc

        pixfunc         avg_ pixels8,, 1

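@ ff_put_pixels_clamped_neon: convert an 8x8 block of 16-bit IDCT
@ coefficients (r0) to pixels, saturating each value to [0,255] with
@ vqmovun.s16, and store 8 bytes per row at r1 with stride r2.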
function ff_put_pixels_clamped_neon, export=1
        vld1.64         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.64         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.64         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.64         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.64         {d0}, [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.64         {d1}, [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.64         {d2}, [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.64         {d3}, [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.64         {d4}, [r1,:64], r2
        vst1.64         {d5}, [r1,:64], r2
        vst1.64         {d6}, [r1,:64], r2
        vst1.64         {d7}, [r1,:64], r2
        bx              lr
endfunc

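@ ff_put_signed_pixels_clamped_neon: like the above, but saturates to
@ the signed range [-128,127] with vqmovn.s16 and then adds a bias of
@ 128 (d31), mapping the result back to unsigned pixels.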
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.64         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.64         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.64         {d0}, [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.64         {d1}, [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.64         {d2}, [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.64         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.64         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.64         {d3}, [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.64         {d4}, [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.64         {d5}, [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.64         {d6}, [r1,:64], r2
        vst1.64         {d7}, [r1,:64], r2
        bx              lr
endfunc

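@ ff_add_pixels_clamped_neon: add the 8x8 coefficient block at r0 to the
@ existing pixels at r1 (widening with vaddw.u8), clamp to [0,255] with
@ vqmovun.s16 and write back through r3, which shadows the original r1.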
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.64         {d16}, [r1,:64], r2
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.64         {d17}, [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.64         {d18}, [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0}, [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19}, [r1,:64], r2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.64         {d2}, [r3,:64], r2
        vld1.64         {d16}, [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.64         {d4}, [r3,:64], r2
        vld1.64         {d17}, [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.64         {d6}, [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.64         {d18}, [r1,:64], r2
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0}, [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19}, [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.64         {d2}, [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.64         {d4}, [r3,:64], r2
        vst1.64         {d6}, [r3,:64], r2
        bx              lr
endfunc

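@ ff_float_to_int16_neon: convert floats to signed 16-bit integers.
@ vcvt with 16 fractional bits turns each float into 16.16 fixed point
@ (saturating in the conversion); vshrn.s32 #16 then narrows to the
@ integer part.  r0 = dst, r1 = src, r2 = len, assumed a multiple of 8.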
function ff_float_to_int16_neon, export=1
        subs            r2, r2, #8
        vld1.64         {d0-d1}, [r1,:128]!
        vcvt.s32.f32    q8, q0, #16
        vld1.64         {d2-d3}, [r1,:128]!
        vcvt.s32.f32    q9, q1, #16
        beq             3f
        bics            ip, r2, #15
        beq             2f
1:      subs            ip, ip, #16
        vshrn.s32       d4, q8, #16
        vld1.64         {d0-d1}, [r1,:128]!
        vcvt.s32.f32    q0, q0, #16
        vshrn.s32       d5, q9, #16
        vld1.64         {d2-d3}, [r1,:128]!
        vcvt.s32.f32    q1, q1, #16
        vshrn.s32       d6, q0, #16
        vst1.64         {d4-d5}, [r0,:128]!
        vshrn.s32       d7, q1, #16
        vld1.64         {d16-d17},[r1,:128]!
        vcvt.s32.f32    q8, q8, #16
        vld1.64         {d18-d19},[r1,:128]!
        vcvt.s32.f32    q9, q9, #16
        vst1.64         {d6-d7}, [r0,:128]!
        bne             1b
        ands            r2, r2, #15
        beq             3f
2:      vld1.64         {d0-d1}, [r1,:128]!
        vshrn.s32       d4, q8, #16
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3}, [r1,:128]!
        vshrn.s32       d5, q9, #16
        vcvt.s32.f32    q1, q1, #16
        vshrn.s32       d6, q0, #16
        vst1.64         {d4-d5}, [r0,:128]!
        vshrn.s32       d7, q1, #16
        vst1.64         {d6-d7}, [r0,:128]!
        bx              lr
3:      vshrn.s32       d4, q8, #16
        vshrn.s32       d5, q9, #16
        vst1.64         {d4-d5}, [r0,:128]!
        bx              lr
endfunc

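@ ff_float_to_int16_interleave_neon: as above, but reads from an array
@ of per-channel source pointers (r1) and writes interleaved output.
@ vsri.32 #16 merges two channels' 16.16 results into one 32-bit lane.
@ r0 = dst, r2 = samples per channel, r3 = channel count; dedicated
@ paths handle 1 and 2 channels, with larger counts looped in groups
@ of 4, then 2, then 1.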
function ff_float_to_int16_interleave_neon, export=1
        cmp             r3, #2
        ldrlt           r1, [r1]
        blt             ff_float_to_int16_neon
        bne             4f

        ldr             r3, [r1]
        ldr             r1, [r1, #4]

        subs            r2, r2, #8
        vld1.64         {d0-d1}, [r3,:128]!
        vcvt.s32.f32    q8, q0, #16
        vld1.64         {d2-d3}, [r3,:128]!
        vcvt.s32.f32    q9, q1, #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             3f
        bics            ip, r2, #15
        beq             2f
1:      subs            ip, ip, #16
        vld1.64         {d0-d1}, [r3,:128]!
        vcvt.s32.f32    q0, q0, #16
        vsri.32         q10, q8, #16
        vld1.64         {d2-d3}, [r3,:128]!
        vcvt.s32.f32    q1, q1, #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27},[r1,:128]!
        vsri.32         q11, q9, #16
        vst1.64         {d20-d21},[r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q12, q0, #16
        vld1.64         {d16-d17},[r3,:128]!
        vsri.32         q13, q1, #16
        vst1.64         {d24-d25},[r0,:128]!
        vcvt.s32.f32    q8, q8, #16
        vld1.64         {d18-d19},[r3,:128]!
        vcvt.s32.f32    q9, q9, #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27},[r0,:128]!
        bne             1b
        ands            r2, r2, #15
        beq             3f
2:      vsri.32         q10, q8, #16
        vld1.64         {d0-d1}, [r3,:128]!
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3}, [r3,:128]!
        vcvt.s32.f32    q1, q1, #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9, #16
        vld1.64         {d26-d27},[r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21},[r0,:128]!
        vsri.32         q12, q0, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q13, q1, #16
        vst1.64         {d24-d27},[r0,:128]!
        bx              lr
3:      vsri.32         q10, q8, #16
        vsri.32         q11, q9, #16
        vst1.64         {d20-d23},[r0,:128]!
        bx              lr

4:      push            {r4-r8,lr}
        cmp             r3, #4
        lsl             ip, r3, #1
        blt             4f

        @ 4 channels
5:      ldmia           r1!, {r4-r7}
        mov             lr, r2
        mov             r8, r0
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr, lr, #8
        vld1.64         {d0-d1}, [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vsri.32         q9, q8, #16
        vld1.64         {d2-d3}, [r5,:128]!
        vcvt.s32.f32    q1, q1, #16
        vsri.32         q11, q10, #16
        vld1.64         {d4-d5}, [r6,:128]!
        vcvt.s32.f32    q2, q2, #16
        vzip.32         d18, d22
        vld1.64         {d6-d7}, [r7,:128]!
        vcvt.s32.f32    q3, q3, #16
        vzip.32         d19, d23
        vst1.64         {d18}, [r8], ip
        vsri.32         q1, q0, #16
        vst1.64         {d22}, [r8], ip
        vsri.32         q3, q2, #16
        vst1.64         {d19}, [r8], ip
        vzip.32         d2, d6
        vst1.64         {d23}, [r8], ip
        vzip.32         d3, d7
        beq             7f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vst1.64         {d2}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vst1.64         {d6}, [r8], ip
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3}, [r8], ip
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7}, [r8], ip
        b               6b
7:      vst1.64         {d2}, [r8], ip
        vst1.64         {d6}, [r8], ip
        vst1.64         {d3}, [r8], ip
        vst1.64         {d7}, [r8], ip
        subs            r3, r3, #4
        popeq           {r4-r8,pc}
        cmp             r3, #4
        add             r0, r0, #8
        bge             5b

        @ 2 channels
4:      cmp             r3, #2
        blt             4f
        ldmia           r1!, {r4-r5}
        mov             lr, r2
        mov             r8, r0
        tst             lr, #8
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             6f
        subs            lr, lr, #8
        beq             7f
        vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr, lr, #16
        vld1.64         {d0-d1}, [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vsri.32         d18, d16, #16
        vld1.64         {d2-d3}, [r5,:128]!
        vcvt.s32.f32    q1, q1, #16
        vsri.32         d19, d17, #16
        vld1.64         {d4-d5}, [r4,:128]!
        vcvt.s32.f32    q2, q2, #16
        vld1.64         {d6-d7}, [r5,:128]!
        vcvt.s32.f32    q3, q3, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vsri.32         d2, d0, #16
        vst1.32         {d19[1]}, [r8], ip
        vsri.32         d3, d1, #16
        vst1.32         {d22[0]}, [r8], ip
        vsri.32         d6, d4, #16
        vst1.32         {d22[1]}, [r8], ip
        vsri.32         d7, d5, #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        beq             6f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8, q8, #16
        vst1.32         {d2[0]}, [r8], ip
        vst1.32         {d2[1]}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9, q9, #16
        vst1.32         {d3[0]}, [r8], ip
        vst1.32         {d3[1]}, [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d6[0]}, [r8], ip
        vst1.32         {d6[1]}, [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.32         {d7[0]}, [r8], ip
        vst1.32         {d7[1]}, [r8], ip
        bgt             6b
6:      vst1.32         {d2[0]}, [r8], ip
        vst1.32         {d2[1]}, [r8], ip
        vst1.32         {d3[0]}, [r8], ip
        vst1.32         {d3[1]}, [r8], ip
        vst1.32         {d6[0]}, [r8], ip
        vst1.32         {d6[1]}, [r8], ip
        vst1.32         {d7[0]}, [r8], ip
        vst1.32         {d7[1]}, [r8], ip
        b               8f
7:      vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
8:      subs            r3, r3, #2
        add             r0, r0, #4
        popeq           {r4-r8,pc}

        @ 1 channel
4:      ldr             r4, [r1], #4
        tst             r2, #8
        mov             lr, r2
        mov             r5, r0
        vld1.64         {d0-d1}, [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3}, [r4,:128]!
        vcvt.s32.f32    q1, q1, #16
        bne             8f
6:      subs            lr, lr, #16
        vld1.64         {d4-d5}, [r4,:128]!
        vcvt.s32.f32    q2, q2, #16
        vld1.64         {d6-d7}, [r4,:128]!
        vcvt.s32.f32    q3, q3, #16
        vst1.16         {d0[1]}, [r5,:16], ip
        vst1.16         {d0[3]}, [r5,:16], ip
        vst1.16         {d1[1]}, [r5,:16], ip
        vst1.16         {d1[3]}, [r5,:16], ip
        vst1.16         {d2[1]}, [r5,:16], ip
        vst1.16         {d2[3]}, [r5,:16], ip
        vst1.16         {d3[1]}, [r5,:16], ip
        vst1.16         {d3[3]}, [r5,:16], ip
        beq             7f
        vld1.64         {d0-d1}, [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3}, [r4,:128]!
        vcvt.s32.f32    q1, q1, #16
7:      vst1.16         {d4[1]}, [r5,:16], ip
        vst1.16         {d4[3]}, [r5,:16], ip
        vst1.16         {d5[1]}, [r5,:16], ip
        vst1.16         {d5[3]}, [r5,:16], ip
        vst1.16         {d6[1]}, [r5,:16], ip
        vst1.16         {d6[3]}, [r5,:16], ip
        vst1.16         {d7[1]}, [r5,:16], ip
        vst1.16         {d7[3]}, [r5,:16], ip
        bgt             6b
        pop             {r4-r8,pc}
8:      subs            lr, lr, #8
        vst1.16         {d0[1]}, [r5,:16], ip
        vst1.16         {d0[3]}, [r5,:16], ip
        vst1.16         {d1[1]}, [r5,:16], ip
        vst1.16         {d1[3]}, [r5,:16], ip
        vst1.16         {d2[1]}, [r5,:16], ip
        vst1.16         {d2[3]}, [r5,:16], ip
        vst1.16         {d3[1]}, [r5,:16], ip
        vst1.16         {d3[3]}, [r5,:16], ip
        popeq           {r4-r8,pc}
        vld1.64         {d0-d1}, [r4,:128]!
        vcvt.s32.f32    q0, q0, #16
        vld1.64         {d2-d3}, [r4,:128]!
        vcvt.s32.f32    q1, q1, #16
        b               6b
endfunc

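@ ff_vector_fmul_neon: element-wise multiply of two float vectors,
@ dst[i] *= src[i]; r0 serves as both the first source and (via the
@ copy in r3) the destination.  r2 = len, assumed a multiple of 8.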
function ff_vector_fmul_neon, export=1
        mov             r3, r0
        subs            r2, r2, #8
        vld1.64         {d0-d3}, [r0,:128]!
        vld1.64         {d4-d7}, [r1,:128]!
        vmul.f32        q8, q0, q2
        vmul.f32        q9, q1, q3
        beq             3f
        bics            ip, r2, #15
        beq             2f
1:      subs            ip, ip, #16
        vld1.64         {d0-d1}, [r0,:128]!
        vld1.64         {d4-d5}, [r1,:128]!
        vmul.f32        q10, q0, q2
        vld1.64         {d2-d3}, [r0,:128]!
        vld1.64         {d6-d7}, [r1,:128]!
        vmul.f32        q11, q1, q3
        vst1.64         {d16-d19},[r3,:128]!
        vld1.64         {d0-d1}, [r0,:128]!
        vld1.64         {d4-d5}, [r1,:128]!
        vmul.f32        q8, q0, q2
        vld1.64         {d2-d3}, [r0,:128]!
        vld1.64         {d6-d7}, [r1,:128]!
        vmul.f32        q9, q1, q3
        vst1.64         {d20-d23},[r3,:128]!
        bne             1b
        ands            r2, r2, #15
        beq             3f
2:      vld1.64         {d0-d1}, [r0,:128]!
        vld1.64         {d4-d5}, [r1,:128]!
        vst1.64         {d16-d17},[r3,:128]!
        vmul.f32        q8, q0, q2
        vld1.64         {d2-d3}, [r0,:128]!
        vld1.64         {d6-d7}, [r1,:128]!
        vst1.64         {d18-d19},[r3,:128]!
        vmul.f32        q9, q1, q3
3:      vst1.64         {d16-d19},[r3,:128]!
        bx              lr
endfunc

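@ ff_vector_fmul_window_neon: windowed overlap-add as used by MDCT-based
@ decoders, walking the output from both ends towards the middle.
@ r0 = dst, r1/r2 = the two inputs, r3 = window; the VFP/NOVFP lines
@ fetch the bias and length arguments from d0 or the stack, depending
@ on the float ABI in use.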
function ff_vector_fmul_window_neon, export=1
VFP     vdup.32         q8, d0[0]
NOVFP   vld1.32         {d16[],d17[]}, [sp,:32]
        push            {r4,r5,lr}
VFP     ldr             lr, [sp, #12]
NOVFP   ldr             lr, [sp, #16]
        sub             r2, r2, #8
        sub             r5, lr, #2
        add             r2, r2, r5, lsl #2
        add             r4, r3, r5, lsl #3
        add             ip, r0, r5, lsl #3
        mov             r5, #-16
        vld1.64         {d0,d1}, [r1,:128]!
        vld1.64         {d2,d3}, [r2,:128], r5
        vld1.64         {d4,d5}, [r3,:128]!
        vld1.64         {d6,d7}, [r4,:128], r5
1:      subs            lr, lr, #4
        vmov            q11, q8
        vmla.f32        d22, d0, d4
        vmov            q10, q8
        vmla.f32        d23, d1, d5
        vrev64.32       q3, q3
        vmla.f32        d20, d0, d7
        vrev64.32       q1, q1
        vmla.f32        d21, d1, d6
        beq             2f
        vmla.f32        d22, d3, d7
        vld1.64         {d0,d1}, [r1,:128]!
        vmla.f32        d23, d2, d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3, d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2, d5
        vld1.64         {d6,d7}, [r4,:128], r5
        vmov            q1, q9
        vrev64.32       q11, q11
        vmov            q2, q12
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3, d7
        vmla.f32        d23, d2, d6
        vmls.f32        d20, d3, d4
        vmls.f32        d21, d2, d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc

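@ ff_vorbis_inverse_coupling_neon: Vorbis inverse channel coupling.
@ r0 = magnitude vector, r1 = angle vector, r2 = blocksize; sign and
@ select masks (vcle/vand/vbic on the float bit patterns) implement the
@ branchy reference C code without branches.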
#if CONFIG_VORBIS_DECODER
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2, r2, #4
        mov             r3, r0
        mov             r12, r1
        beq             3f

        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8, q12, #0
        vand            q9, q11, q10
        veor            q12, q12, q9
        vand            q2, q12, q8
        vbic            q3, q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3}, [r1,:128]!
        vld1.32         {d0-d1}, [r0,:128]!
        vcle.s32        q8, q1, #0
        vand            q9, q0, q10
        veor            q1, q1, q9
        vst1.32         {d24-d25},[r3, :128]!
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2, q1, q8
        vbic            q3, q1, q8
        vadd.f32        q1, q0, q2
        vsub.f32        q0, q0, q3
        subs            r2, r2, #8
        ble             2f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8, q12, #0
        vand            q9, q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3}, [r3, :128]!
        vst1.32         {d0-d1}, [r12,:128]!
        vand            q2, q12, q8
        vbic            q3, q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b

2:      vst1.32         {d2-d3}, [r3, :128]!
        vst1.32         {d0-d1}, [r12,:128]!
        bxlt            lr

3:      vld1.32         {d2-d3}, [r1,:128]
        vld1.32         {d0-d1}, [r0,:128]
        vcle.s32        q8, q1, #0
        vand            q9, q0, q10
        veor            q1, q1, q9
        vand            q2, q1, q8
        vbic            q3, q1, q8
        vadd.f32        q1, q0, q2
        vsub.f32        q0, q0, q3
        vst1.32         {d2-d3}, [r0,:128]!
        vst1.32         {d0-d1}, [r1,:128]!
        bx              lr
endfunc
#endif

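@ ff_vector_fmul_scalar_neon: dst[i] = src[i] * mul.  The scalar arrives
@ in s0 (VFP ABI) or r2 (soft-float), which shifts the length argument
@ between r2 and r3; hence the len .req aliases below.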
function ff_vector_fmul_scalar_neon, export=1
VFP     len     .req    r2
NOVFP   len     .req    r3
VFP     vdup.32         q8, d0[0]
NOVFP   vdup.32         q8, r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0, q0, q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1, q1, q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2, q2, q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3, q3, q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0, q0, q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

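@ ff_vector_fmul_sv_scalar_2_neon: multiply a contiguous source vector
@ (r1) by a scalar and by short "scattered" two-float vectors reached
@ through the pointer array in r2, two pointers per iteration.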
function ff_vector_fmul_sv_scalar_2_neon, export=1
VFP     vdup.32         d16, d0[0]
NOVFP   vdup.32         d16, r3
NOVFP   ldr             r3, [sp]
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
1:      subs            r3, r3, #4
        vmul.f32        d4, d0, d16
        vmul.f32        d5, d1, d16
        ldr             r12, [r2], #4
        vld1.32         {d2},[r12,:64]
        ldr             r12, [r2], #4
        vld1.32         {d3},[r12,:64]
        vmul.f32        d4, d4, d2
        vmul.f32        d5, d5, d3
        beq             2f
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
        vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        b               1b
2:      vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        bx              lr
endfunc

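@ ff_vector_fmul_sv_scalar_4_neon: the same operation with four-float
@ scattered vectors, unrolled two q registers per iteration plus a
@ one-vector tail loop.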
function ff_vector_fmul_sv_scalar_4_neon, export=1
VFP     vdup.32         q10, d0[0]
NOVFP   vdup.32         q10, r3
NOVFP   ldr             r3, [sp]
        push            {lr}
        bics            lr, r3, #7
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
1:      ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        ldr             r12, [r2], #4
        vld1.32         {q3},[r12,:128]
        vmul.f32        q8, q0, q10
        vmul.f32        q8, q8, q1
        vmul.f32        q9, q2, q10
        vmul.f32        q9, q9, q3
        subs            lr, lr, #8
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
        vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        b               1b
2:      vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        ands            r3, r3, #7
        popeq           {pc}
3:      vld1.32         {q0},[r1,:128]!
        ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        vmul.f32        q0, q0, q10
        vmul.f32        q0, q0, q1
        vst1.32         {q0},[r0,:128]!
        subs            r3, r3, #4
        bgt             3b
        pop             {pc}
endfunc

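@ ff_sv_fmul_scalar_2_neon: gather two-float vectors through the pointer
@ array in r1 and scale them by the scalar; there is no contiguous
@ source operand.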
function ff_sv_fmul_scalar_2_neon, export=1
VFP     len     .req    r2
NOVFP   len     .req    r3
VFP     vdup.32         q8, d0[0]
NOVFP   vdup.32         q8, r2
        ldr             r12, [r1], #4
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
1:      vmul.f32        q1, q0, q8
        subs            len, len, #4
        beq             2f
        ldr             r12, [r1], #4
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
        vst1.32         {q1},[r0,:128]!
        b               1b
2:      vst1.32         {q1},[r0,:128]!
        bx              lr
        .unreq          len
endfunc

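@ ff_sv_fmul_scalar_4_neon: four-float variant of the gather-and-scale
@ loop above.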
function ff_sv_fmul_scalar_4_neon, export=1
VFP     len     .req    r2
NOVFP   len     .req    r3
VFP     vdup.32         q8, d0[0]
NOVFP   vdup.32         q8, r2
1:      ldr             r12, [r1], #4
        vld1.32         {q0},[r12,:128]
        vmul.f32        q0, q0, q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             1b
        bx              lr
        .unreq          len
endfunc

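@ ff_butterflies_float_neon: in-place butterflies over two vectors:
@ t = v1[i] - v2[i]; v1[i] += v2[i]; v2[i] = t.  r2 = len.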
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2, q0, q1
        vadd.f32        q1, q0, q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2, r2, #4
        bgt             1b
        bx              lr
endfunc

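@ ff_scalarproduct_float_neon: dot product of two float vectors.  Four
@ partial sums accumulate in q2 and are reduced with vadd + vpadd; the
@ result is returned in s0 (VFP ABI) or r0 (soft-float).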
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2, #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2, q0, q1
        subs            r2, r2, #4
        bgt             1b
        vadd.f32        d0, d4, d5
        vpadd.f32       d0, d0, d0
NOVFP   vmov.32         r0, d0[0]
        bx              lr
endfunc

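@ ff_int32_to_float_fmul_scalar_neon: dst[i] = src[i] * mul, with the
@ int32 input converted to float via vcvt.f32.s32 before scaling.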
function ff_int32_to_float_fmul_scalar_neon, export=1
VFP     vdup.32         q0, d0[0]
VFP     len     .req    r2
NOVFP   vdup.32         q0, r2
NOVFP   len     .req    r3

        vld1.32         {q1},[r1,:128]!
        vcvt.f32.s32    q3, q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8, q2
1:      subs            len, len, #8
        pld             [r1, #16]
        vmul.f32        q9, q3, q0
        vmul.f32        q10, q8, q0
        beq             2f
        vld1.32         {q1},[r1,:128]!
        vcvt.f32.s32    q3, q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8, q2
        vst1.32         {q9}, [r0,:128]!
        vst1.32         {q10},[r0,:128]!
        b               1b
2:      vst1.32         {q9}, [r0,:128]!
        vst1.32         {q10},[r0,:128]!
        bx              lr
        .unreq          len
endfunc

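@ ff_vector_fmul_reverse_neon: dst[i] = src0[i] * src1[len-1-i].  The
@ second source is walked backwards with a negative stride, and each
@ block is reversed with vrev64.32 plus swapped d-register operands.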
function ff_vector_fmul_reverse_neon, export=1
        add             r2, r2, r3, lsl #2
        sub             r2, r2, #32
        mov             r12, #-32
        vld1.32         {q0-q1}, [r1,:128]!
        vld1.32         {q2-q3}, [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3, q3
        vmul.f32        d16, d0, d7
        vmul.f32        d17, d1, d6
        pld             [r2, #-32]
        vrev64.32       q2, q2
        vmul.f32        d18, d2, d5
        vmul.f32        d19, d3, d4
        subs            r3, r3, #8
        beq             2f
        vld1.32         {q0-q1}, [r1,:128]!
        vld1.32         {q2-q3}, [r2,:128], r12
        vst1.32         {q8-q9}, [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9}, [r0,:128]!
        bx              lr
endfunc

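@ ff_vector_fmul_add_neon: dst[i] = src0[i] * src1[i] + src2[i];
@ the length comes in on the stack as the fifth argument.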
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1}, [r1,:128]!
        vld1.32         {q8-q9}, [r2,:128]!
        vld1.32         {q2-q3}, [r3,:128]!
        vmul.f32        q10, q0, q8
        vmul.f32        q11, q1, q9
1:      vadd.f32        q12, q2, q10
        vadd.f32        q13, q3, q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0}, [r1,:128]!
        vld1.32         {q8}, [r2,:128]!
        vmul.f32        q10, q0, q8
        vld1.32         {q1}, [r1,:128]!
        vld1.32         {q9}, [r2,:128]!
        vmul.f32        q11, q1, q9
        vld1.32         {q2-q3}, [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc

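@ ff_vector_clipf_neon: clamp each element to [min, max] with a
@ vmin/vmax pair.  min/max arrive in d0 (VFP ABI) or r2/r3 (soft-float).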
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1, d0[1]
VFP     vdup.32         q0, d0[0]
NOVFP   vdup.32         q0, r2
NOVFP   vdup.32         q1, r3
NOVFP   ldr             r2, [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2, q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3, q1
1:      vmax.f32        q8, q10, q0
        vmax.f32        q9, q11, q0
        subs            r2, r2, #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2, q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3, q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc