@ libavcodec/arm/dsputil_neon.S -- ARM NEON optimised DSP functions
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        preserve8
        .text
@ void ff_clear_block_neon(DCTELEM *block)
@ Zero one block of 16-bit coefficients: 8 x 16 bytes = 128 bytes at r0.
@ In:  r0 = block (16-byte aligned, per the :128 alignment hint)
@ Clobbers: r0 (advanced past the block), q0
function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0                 @ q0 = 0x0000 x8
        vst1.16         {q0}, [r0,:128]!        @ row 0
        vst1.16         {q0}, [r0,:128]!        @ row 1
        vst1.16         {q0}, [r0,:128]!        @ row 2
        vst1.16         {q0}, [r0,:128]!        @ row 3
        vst1.16         {q0}, [r0,:128]!        @ row 4
        vst1.16         {q0}, [r0,:128]!        @ row 5
        vst1.16         {q0}, [r0,:128]!        @ row 6
        vst1.16         {q0}, [r0,:128]!        @ row 7
        bx              lr
endfunc
@ void ff_clear_blocks_neon(DCTELEM *blocks)
@ Zero six consecutive coefficient blocks (6 x 128 bytes) at r0.
@ In:  r0 = blocks (16-byte aligned)
@ Clobbers: r0 (advanced past the blocks), q0
function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0                 @ q0 = 0x0000 x8
        .rept           6                       @ six blocks ...
        .rept           8                       @ ... of eight 16-byte rows each
        vst1.16         {q0}, [r0,:128]!
        .endr
        .endr
        bx              lr
endfunc
@ pixels16 avg=0|1
@ Copy (avg=0) or average-into (avg=1) a 16-wide pixel block.
@ In:  r0 = dst (16-byte aligned), r1 = src, r2 = line stride,
@      r3 = height (multiple of 4; loop consumes 4 rows per iteration)
@ avg=1 reads the existing dst rows through ip and rounds the average
@ (vrhadd).  pld hints prefetch the next four source rows.
        .macro pixels16 avg=0
.if \avg
        mov             ip, r0                  @ ip = second read pointer over dst
.endif
1:      vld1.64         {d0, d1},  [r1], r2     @ load 4 src rows into q0-q3
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip,:128], r2
        vrhadd.u8       q0,  q0,  q8            @ dst = rnd_avg(src, dst)
        vld1.64         {d18,d19}, [ip,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip,:128], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4            @ four rows done
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm
@ pixels16_x2 vhadd=vrhadd.u8|vhadd.u8
@ 16-wide horizontal half-pel: each output byte is the (rounded or
@ truncated, per \vhadd) average of a source byte and its right neighbour.
@ In:  r0 = dst (16-byte aligned), r1 = src, r2 = stride,
@      r3 = height (multiple of 2)
        .macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},  [r1], r2      @ 17+ bytes of row 0 -> q0, d2
        vld1.64         {d4-d6},  [r1], r2      @ row 1 -> q2, d6
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1       @ q1 = row 0 shifted left one byte
        \vhadd          q0,  q0,  q1            @ avg(row0, row0+1)
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1}, [r0,:128], r2
        vst1.64         {d4, d5}, [r0,:128], r2
        bne             1b
        bx              lr
        .endm
@ pixels16_y2 vhadd=vrhadd.u8|vhadd.u8
@ 16-wide vertical half-pel: each output row is the average of two
@ consecutive source rows (rounded or truncated per \vhadd).
@ In:  r0 = dst (16-byte aligned), r1 = src, r2 = stride,
@      r3 = height (multiple of 2)
@ q0/q1 hold the two most recent source rows across iterations.
        .macro pixels16_y2 vhadd=vrhadd.u8
        vld1.64         {d0, d1}, [r1], r2      @ prime q0, q1 with rows 0 and 1
        vld1.64         {d2, d3}, [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1            @ avg(row n, row n+1)
        vld1.64         {d0, d1}, [r1], r2      @ load row n+2 (overwrites q0)
        \vhadd          q3,  q0,  q1            @ avg(row n+2, row n+1)
        vld1.64         {d2, d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4, d5}, [r0,:128], r2
        vst1.64         {d6, d7}, [r0,:128], r2
        bne             1b
        bx              lr
        .endm
@ pixels16_xy2 vshrn=vrshrn.u16|vshrn.u16 no_rnd=0|1
@ 16-wide 2x2 half-pel: out = (a + b + c + d) >> 2 over each 2x2 source
@ quad.  Widened u16 column sums (src[x]+src[x+1]) are kept in q8-q11
@ between rows; \vshrn narrows with or without rounding, and no_rnd=1
@ adds the +1 bias used by the no-rounding variant.
@ In:  r0 = dst (16-byte aligned), r1 = src, r2 = stride,
@      r3 = height (multiple of 2; loop exits via bgt after subs)
        .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0-d2},  [r1], r2      @ prime: rows 0 and 1 (17+ bytes)
        vld1.64         {d4-d6},  [r1], r2
.if \no_rnd
        vmov.i16        q13, #1                 @ bias for truncating variant
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1       @ rows shifted left one byte
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2            @ horizontal sums, row 0
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6            @ horizontal sums, row 1
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},  [r1], r2      @ next even row
        vadd.u16        q12, q8,  q9            @ 2x2 sum (left half)
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1 , q10, q11           @ 2x2 sum (right half)
        \vshrn          d28, q12, #2            @ >>2 -> output bytes
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30           @ new horizontal sums for this row
        vld1.64         {d2-d4},  [r1], r2      @ next odd row
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        bx              lr
        .endm
@ pixels8 avg=0|1
@ Copy (avg=0) or rounded-average-into (avg=1) an 8-wide pixel block.
@ In:  r0 = dst (8-byte aligned), r1 = src, r2 = stride,
@      r3 = height (multiple of 4)
@ avg=1 walks dst forward while loading, then rewinds r0 by 4 rows
@ (sub r0, r0, r2, lsl #2) before storing.
        .macro pixels8 avg=0
1:      vld1.64         {d0}, [r1], r2          @ 4 src rows -> d0-d3
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d4}, [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4            @ dst = rnd_avg(src, dst)
        vld1.64         {d5}, [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.64         {d6}, [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.64         {d7}, [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2   @ rewind dst to row n
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
        .endm
@ pixels8_x2 vhadd=vrhadd.u8|vhadd.u8
@ 8-wide horizontal half-pel: average each byte with its right neighbour.
@ In:  r0 = dst (8-byte aligned), r1 = src, r2 = stride,
@      r3 = height (multiple of 2)
@ vswp pairs the two rows as q0/q1 so one \vhadd handles both.
        .macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1}, [r1], r2      @ row 0: 9+ bytes
        vext.8          d1,  d0,  d1,  #1       @ d1 = row 0 shifted left one byte
        vld1.64         {d2, d3}, [r1], r2      @ row 1
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2                 @ q0 = originals, q1 = shifted
        \vhadd          q0,  q0,  q1
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        bne             1b
        bx              lr
        .endm
@ pixels8_y2 vhadd=vrhadd.u8|vhadd.u8
@ 8-wide vertical half-pel: average two consecutive source rows.
@ In:  r0 = dst (8-byte aligned), r1 = src, r2 = stride,
@      r3 = height (multiple of 2)
@ d0/d1 carry the two most recent source rows across iterations.
        .macro pixels8_y2 vhadd=vrhadd.u8
        vld1.64         {d0}, [r1], r2          @ prime with rows 0 and 1
        vld1.64         {d1}, [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1            @ avg(row n, row n+1)
        vld1.64         {d0}, [r1], r2          @ row n+2 overwrites d0
        \vhadd          d5,  d0,  d1            @ avg(row n+2, row n+1)
        vld1.64         {d1}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        bne             1b
        bx              lr
        .endm
@ pixels8_xy2 vshrn=vrshrn.u16|vshrn.u16 no_rnd=0|1
@ 8-wide 2x2 half-pel: out = (a + b + c + d) >> 2 per 2x2 source quad.
@ Widened u16 horizontal sums stay in q8/q9 between rows; \vshrn narrows
@ with or without rounding and no_rnd=1 adds the +1 bias.
@ In:  r0 = dst (8-byte aligned), r1 = src, r2 = stride,
@      r3 = height (multiple of 2)
        .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0, d1}, [r1], r2      @ prime: rows 0 and 1 (9+ bytes)
        vld1.64         {d2, d3}, [r1], r2
.if \no_rnd
        vmov.i16        q11, #1                 @ bias for truncating variant
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1       @ shifted rows
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4            @ horizontal sums, rows 0/1
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1}, [r1], r2      @ next even row
        pld             [r1]
        vadd.u16        q10, q8,  q9            @ 2x2 sum
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4            @ new even-row sums
        \vshrn          d5,  q10, #2            @ >>2 -> output row
        vld1.64         {d2, d3}, [r1], r2      @ next odd row
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5}, [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6            @ new odd-row sums
        vst1.64         {d7}, [r0,:64], r2
        bgt             1b
        bx              lr
        .endm
@ pixfunc pfx name [suf] [rnd_op] [args...]
@ Instantiate one exported pixel function ff_<pfx><name><suf>_neon whose
@ body is a single invocation of the \name macro with \rnd_op \args.
        .macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd_op \args
endfunc
        .endm
@ pixfunc2 pfx name [args...]
@ Instantiate both rounding variants of a pixel op: the default
@ (rounding) version and the _no_rnd version built from \args.
        .macro pixfunc2 pfx name args:vararg
        pixfunc         \pfx \name
        pixfunc         \pfx \name \args
        .endm
@ ff_put_h264_qpel16_mc00_neon: full-pel 16x16 H.264 copy.  It only sets
@ r3 (height) to 16 and FALLS THROUGH into ff_put_pixels16_neon, which the
@ pixfunc line below must therefore emit immediately after this stub --
@ do not reorder or separate these.
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
endfunc

        @ put 16-wide variants: copy, x2/y2/xy2 half-pel, each with the
        @ rounding (default) and _no_rnd forms.
        pixfunc         put_ pixels16
        pixfunc2        put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
@ ff_avg_h264_qpel16_mc00_neon: full-pel 16x16 H.264 average.  Sets
@ r3 (height) = 16 and falls through into ff_avg_pixels16_neon emitted by
@ the pixfunc line below (pixels16 with avg=1) -- keep them adjacent.
function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
endfunc

        pixfunc         avg_ pixels16,, 1
281
282function ff_put_h264_qpel8_mc00_neon, export=1
12bf71b6 283 mov r3, #8
a7e7d40c 284endfunc
569f5a75
MR
285
286 pixfunc put_ pixels8
287 pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8
288 pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8
289 pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
@ ff_avg_h264_qpel8_mc00_neon: full-pel 8x8 H.264 average.  Sets r3
@ (height) = 8 and falls through into ff_avg_pixels8_neon emitted by the
@ pixfunc line below (pixels8 with avg=1) -- keep them adjacent.
function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3, #8
endfunc

        pixfunc         avg_ pixels8,, 1
@ ff_put_pixels_clamped_neon(const DCTELEM *block, uint8_t *pixels, int line_size)
@ Convert an 8x8 block of signed 16-bit coefficients to unsigned bytes
@ with saturation (vqmovun clamps to [0,255]) and store one 8-byte row
@ per line of the destination.
@ In:  r0 = block (16-byte aligned), r1 = pixels (8-byte aligned), r2 = line_size
@ Loads and narrows are interleaved with the stores for scheduling.
function ff_put_pixels_clamped_neon, export=1
        vld1.64         {d16-d19}, [r0,:128]!   @ rows 0-1
        vqmovun.s16     d0,  q8                 @ saturate s16 -> u8
        vld1.64         {d20-d23}, [r0,:128]!   @ rows 2-3
        vqmovun.s16     d1,  q9
        vld1.64         {d24-d27}, [r0,:128]!   @ rows 4-5
        vqmovun.s16     d2,  q10
        vld1.64         {d28-d31}, [r0,:128]!   @ rows 6-7
        vqmovun.s16     d3,  q11
        vst1.64         {d0}, [r1,:64], r2
        vqmovun.s16     d4,  q12
        vst1.64         {d1}, [r1,:64], r2
        vqmovun.s16     d5,  q13
        vst1.64         {d2}, [r1,:64], r2
        vqmovun.s16     d6,  q14
        vst1.64         {d3}, [r1,:64], r2
        vqmovun.s16     d7,  q15
        vst1.64         {d4}, [r1,:64], r2
        vst1.64         {d5}, [r1,:64], r2
        vst1.64         {d6}, [r1,:64], r2
        vst1.64         {d7}, [r1,:64], r2
        bx              lr
endfunc
@ ff_put_signed_pixels_clamped_neon(const DCTELEM *block, uint8_t *pixels, int line_size)
@ Like put_pixels_clamped, but for signed output: each s16 coefficient is
@ saturated to s8 (vqmovn) and then offset by +128 (d31) to map
@ [-128,127] onto [0,255] before storing 8 bytes per row.
@ In:  r0 = block (16-byte aligned), r1 = pixels (8-byte aligned), r2 = line_size
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128               @ bias added after narrowing
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0,  q8                 @ saturate s16 -> s8
        vld1.64         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1,  q9
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2,  q8
        vld1.64         {d18-d19}, [r0,:128]!
        vadd.u8         d0,  d0,  d31           @ +128 -> unsigned range
        vld1.64         {d20-d21}, [r0,:128]!
        vadd.u8         d1,  d1,  d31
        vld1.64         {d22-d23}, [r0,:128]!
        vadd.u8         d2,  d2,  d31
        vst1.64         {d0}, [r1,:64], r2
        vqmovn.s16      d3,  q9
        vst1.64         {d1}, [r1,:64], r2
        vqmovn.s16      d4,  q10
        vst1.64         {d2}, [r1,:64], r2
        vqmovn.s16      d5,  q11
        vld1.64         {d24-d25}, [r0,:128]!
        vadd.u8         d3,  d3,  d31
        vld1.64         {d26-d27}, [r0,:128]!
        vadd.u8         d4,  d4,  d31
        vadd.u8         d5,  d5,  d31
        vst1.64         {d3}, [r1,:64], r2
        vqmovn.s16      d6,  q12
        vst1.64         {d4}, [r1,:64], r2
        vqmovn.s16      d7,  q13
        vst1.64         {d5}, [r1,:64], r2
        vadd.u8         d6,  d6,  d31
        vadd.u8         d7,  d7,  d31
        vst1.64         {d6}, [r1,:64], r2
        vst1.64         {d7}, [r1,:64], r2
        bx              lr
endfunc
@ ff_add_pixels_clamped_neon(const DCTELEM *block, uint8_t *pixels, int line_size)
@ Add an 8x8 block of s16 coefficients to the existing pixel rows and
@ store back with unsigned saturation: pixels[i] = clamp_u8(pixels[i] +
@ block[i]).  r3 is a second pointer over pixels so reads (r1) can run
@ ahead of writes (r3).
@ In:  r0 = block (16-byte aligned), r1 = pixels (8-byte aligned), r2 = line_size
function ff_add_pixels_clamped_neon, export=1
        mov             r3,  r1                 @ r3 = write pointer, r1 = read pointer
        vld1.64         {d16},  [r1,:64], r2    @ pixel row 0
        vld1.64         {d0-d1},  [r0,:128]!    @ coeff row 0
        vaddw.u8        q0,  q0,  d16           @ widen u8 and add s16 coeffs
        vld1.64         {d17},  [r1,:64], r2
        vld1.64         {d2-d3},  [r0,:128]!
        vqmovun.s16     d0,  q0                 @ clamp to [0,255]
        vld1.64         {d18},  [r1,:64], r2
        vaddw.u8        q1,  q1,  d17
        vld1.64         {d4-d5},  [r0,:128]!
        vaddw.u8        q2,  q2,  d18
        vst1.64         {d0},   [r3,:64], r2
        vqmovun.s16     d2,  q1
        vld1.64         {d19},  [r1,:64], r2
        vld1.64         {d6-d7},  [r0,:128]!
        vaddw.u8        q3,  q3,  d19
        vqmovun.s16     d4,  q2
        vst1.64         {d2},   [r3,:64], r2
        vld1.64         {d16},  [r1,:64], r2    @ second half: rows 4-7
        vqmovun.s16     d6,  q3
        vld1.64         {d0-d1},  [r0,:128]!
        vaddw.u8        q0,  q0,  d16
        vst1.64         {d4},   [r3,:64], r2
        vld1.64         {d17},  [r1,:64], r2
        vld1.64         {d2-d3},  [r0,:128]!
        vaddw.u8        q1,  q1,  d17
        vst1.64         {d6},   [r3,:64], r2
        vqmovun.s16     d0,  q0
        vld1.64         {d18},  [r1,:64], r2
        vld1.64         {d4-d5},  [r0,:128]!
        vaddw.u8        q2,  q2,  d18
        vst1.64         {d0},   [r3,:64], r2
        vqmovun.s16     d2,  q1
        vld1.64         {d19},  [r1,:64], r2
        vqmovun.s16     d4,  q2
        vld1.64         {d6-d7},  [r0,:128]!
        vaddw.u8        q3,  q3,  d19
        vst1.64         {d2},   [r3,:64], r2
        vqmovun.s16     d6,  q3
        vst1.64         {d4},   [r3,:64], r2
        vst1.64         {d6},   [r3,:64], r2
        bx              lr
endfunc
@ ff_float_to_int16_neon(int16_t *dst, const float *src, long len)
@ Convert floats to s16: vcvt to Q16 fixed point (saturating), then
@ vshrn >>16 narrows to the integer part.  Processes 8 at a time; the
@ main loop (1:) does 16 per iteration, 2: handles a trailing 8, 3: the
@ len==8 / final case.
@ In:  r0 = dst (16-byte aligned), r1 = src (16-byte aligned),
@      r2 = len (multiple of 8, >= 8 -- implied by subs #8 / bics #15)
function ff_float_to_int16_neon, export=1
        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q8,  q0,  #16           @ float -> s32 Q16, saturating
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        beq             3f                      @ len was exactly 8
        bics            ip,  r2,  #15
        beq             2f                      @ no full 16-wide iterations
1:      subs            ip,  ip,  #16           @ main loop: 16 samples/iter
        vshrn.s32       d4,  q8,  #16           @ take integer part
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vshrn.s32       d5,  q9,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vld1.64         {d16-d17},[r1,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r1,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r1,:128]!    @ trailing 8 samples
        vshrn.s32       d4,  q8,  #16
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vshrn.s32       d5,  q9,  #16
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bx              lr
3:      vshrn.s32       d4,  q8,  #16           @ flush last 8 already converted
        vshrn.s32       d5,  q9,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        bx              lr
endfunc
@ ff_float_to_int16_interleave_neon(int16_t *dst, const float **src,
@                                   long len, int channels)
@ Convert per-channel float buffers to interleaved s16 output.
@ In:  r0 = dst, r1 = array of channel pointers (each 16-byte aligned),
@      r2 = len per channel (multiple of 8 -- implied by the subs #8
@      stepping), r3 = channel count.
@ Dispatch: channels<2 tail-calls ff_float_to_int16_neon on src[0];
@ channels==2 uses the fast vsri-merge path below; otherwise 4f handles
@ groups of 4 channels (5:/6:/7:), then pairs (second 4:), then a final
@ single channel (third 4:).  Conversion is vcvt to Q16 + >>16 as in
@ ff_float_to_int16_neon; vsri/vzip merge channel pairs, and ip holds
@ the output stride (2*channels bytes).
function ff_float_to_int16_interleave_neon, export=1
        cmp             r3,  #2
        ldrlt           r1,  [r1]
        blt             ff_float_to_int16_neon  @ mono: plain conversion
        bne             4f                      @ >2 channels: general path

        @ ---- stereo fast path: r3 = src[0], r1 = src[1] ----
        ldr             r3,  [r1]
        ldr             r1,  [r1, #4]

        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16           @ stereo main loop, 16 frames/iter
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q10, q8,  #16           @ merge L into high half of R words
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27},[r1,:128]!
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d21},[r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q12, q0,  #16
        vld1.64         {d16-d17},[r3,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d25},[r0,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r3,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27},[r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vsri.32         q10, q8,  #16           @ stereo tail: trailing 8 frames
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9,  #16
        vld1.64         {d26-d27},[r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21},[r0,:128]!
        vsri.32         q12, q0,  #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d27},[r0,:128]!
        bx              lr
3:      vsri.32         q10, q8,  #16           @ stereo: flush final 8 frames
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d23},[r0,:128]!
        bx              lr

4:      push            {r4-r8,lr}              @ general path (channels > 2)
        cmp             r3,  #4
        lsl             ip,  r3,  #1            @ ip = output stride = 2*channels
        blt             4f

        @ ---- 4 channels at a time ----
5:      ldmia           r1!, {r4-r7}            @ next four channel pointers
        mov             lr,  r2                 @ lr = samples remaining
        mov             r8,  r0                 @ r8 = output cursor for this group
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #8            @ 8 frames of 4 channels/iter
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q9,  q8,  #16           @ pair ch0/ch1
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         q11, q10, #16           @ pair ch2/ch3
        vld1.64         {d4-d5},  [r6,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vzip.32         d18, d22                @ interleave pairs -> 4-ch frames
        vld1.64         {d6-d7},  [r7,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vzip.32         d19, d23
        vst1.64         {d18}, [r8], ip
        vsri.32         q1,  q0,  #16
        vst1.64         {d22}, [r8], ip
        vsri.32         q3,  q2,  #16
        vst1.64         {d19}, [r8], ip
        vzip.32         d2,  d6
        vst1.64         {d23}, [r8], ip
        vzip.32         d3,  d7
        beq             7f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.64         {d2},  [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6},  [r8], ip
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3},  [r8], ip
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7},  [r8], ip
        b               6b
7:      vst1.64         {d2},  [r8], ip         @ flush last 4 frames of the group
        vst1.64         {d6},  [r8], ip
        vst1.64         {d3},  [r8], ip
        vst1.64         {d7},  [r8], ip
        subs            r3,  r3,  #4
        popeq           {r4-r8,pc}              @ done: all channels consumed
        cmp             r3,  #4
        add             r0,  r0,  #8            @ advance dst by 4 s16 slots
        bge             5b

        @ ---- 2 remaining channels ----
4:      cmp             r3,  #2
        blt             4f
        ldmia           r1!, {r4-r5}
        mov             lr,  r2
        mov             r8,  r0
        tst             lr,  #8                 @ len%16 == 8? affects loop entry
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             6f
        subs            lr,  lr,  #8
        beq             7f                      @ len == 8: straight to flush
        vsri.32         d18, d16, #16           @ odd leading 8 frames
        vsri.32         d19, d17, #16
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #16           @ pair main loop, 16 frames/iter
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         d18, d16, #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         d19, d17, #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r5,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vsri.32         d2,  d0,  #16
        vst1.32         {d19[1]}, [r8], ip
        vsri.32         d3,  d1,  #16
        vst1.32         {d22[0]}, [r8], ip
        vsri.32         d6,  d4,  #16
        vst1.32         {d22[1]}, [r8], ip
        vsri.32         d7,  d5,  #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        beq             6f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        bgt             6b
6:      vst1.32         {d2[0]},  [r8], ip      @ flush final merged frames
        vst1.32         {d2[1]},  [r8], ip
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        b               8f
7:      vsri.32         d18, d16, #16           @ len == 8 short-cut flush
        vsri.32         d19, d17, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
8:      subs            r3,  r3,  #2
        add             r0,  r0,  #4            @ advance dst by 2 s16 slots
        popeq           {r4-r8,pc}

        @ ---- 1 remaining channel ----
4:      ldr             r4,  [r1],#4
        tst             r2,  #8
        mov             lr,  r2
        mov             r5,  r0
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        bne             8f                      @ len%16 == 8: handle odd 8 first
6:      subs            lr,  lr,  #16           @ mono strided loop, 16/iter
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r4,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.16         {d0[1]},  [r5,:16], ip  @ store the high (integer) halfword
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        beq             7f
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
7:      vst1.16         {d4[1]},  [r5,:16], ip
        vst1.16         {d4[3]},  [r5,:16], ip
        vst1.16         {d5[1]},  [r5,:16], ip
        vst1.16         {d5[3]},  [r5,:16], ip
        vst1.16         {d6[1]},  [r5,:16], ip
        vst1.16         {d6[3]},  [r5,:16], ip
        vst1.16         {d7[1]},  [r5,:16], ip
        vst1.16         {d7[3]},  [r5,:16], ip
        bgt             6b
        pop             {r4-r8,pc}
8:      subs            lr,  lr,  #8            @ odd leading 8 samples
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        popeq           {r4-r8,pc}
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        b               6b
endfunc
@ ff_vector_fmul_neon: element-wise float multiply, written back in place
@ over the first operand (r3 = r0 keeps a lagging store pointer).
@ In:  r0 = dst/src0 (16-byte aligned), r1 = src1 (16-byte aligned),
@      r2 = len (multiple of 8, >= 8)
@ Loop 1: handles 16/iter, 2: an 8-element tail, 3: flushes the last 8.
function ff_vector_fmul_neon, export=1
        mov             r3,  r0                 @ r3 = store pointer (lags loads)
        subs            r2,  r2,  #8
        vld1.64         {d0-d3},  [r0,:128]!
        vld1.64         {d4-d7},  [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16           @ main loop: 16 elements/iter
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19},[r3,:128]!    @ store previous products
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23},[r3,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r0,:128]!    @ 8-element tail
        vld1.64         {d4-d5},  [r1,:128]!
        vst1.64         {d16-d17},[r3,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vst1.64         {d18-d19},[r3,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19},[r3,:128]!    @ flush final 8 products
        bx              lr
endfunc
@ ff_vector_fmul_window_neon(float *dst, const float *src0,
@                            const float *src1, const float *win,
@                            float add_bias, int len)
@ Windowed overlap-add: walks src0/win forward and src1/win backward
@ (r5 = -16 stride), accumulating into the bias (q8) with vmla/vmls, and
@ writes dst from both ends (r0 forward, ip backward).
@ VFP variant: bias arrives in s0 (d0[0]), len at [sp,#12] after push;
@ NOVFP: bias on the stack at [sp], len at [sp,#16] after push.
@ len is consumed 4 at a time (subs lr, lr, #4).
function ff_vector_fmul_window_neon, export=1
VFP     vdup.32         q8,  d0[0]              @ q8 = add_bias in all lanes
NOVFP   vld1.32         {d16[],d17[]}, [sp,:32]
        push            {r4,r5,lr}
VFP     ldr             lr,  [sp, #12]          @ lr = len
NOVFP   ldr             lr,  [sp, #16]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5,  lsl #2   @ r2 -> end of src1 (read backward)
        add             r4,  r3,  r5,  lsl #3   @ r4 -> end of win
        add             ip,  r0,  r5,  lsl #3   @ ip -> end of dst (written backward)
        mov             r5,  #-16               @ backward stride
        vld1.64         {d0,d1},  [r1,:128]!    @ src0
        vld1.64         {d2,d3},  [r2,:128], r5 @ src1 (reversed)
        vld1.64         {d4,d5},  [r3,:128]!    @ win (front)
        vld1.64         {d6,d7},  [r4,:128], r5 @ win (back)
1:      subs            lr,  lr,  #4
        vmov            q11, q8                 @ start both accumulators at bias
        vmla.f32        d22, d0,  d4
        vmov            q10, q8
        vmla.f32        d23, d1,  d5
        vrev64.32       q3,  q3                 @ reverse back-half window words
        vmla.f32        d20, d0,  d7
        vrev64.32       q1,  q1                 @ reverse src1 words
        vmla.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9                 @ stage next src1/win values
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23                @ full reversal of back output
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7            @ final iteration, no reloads
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc
#if CONFIG_VORBIS_DECODER
@ ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize)
@ Vorbis floor-1 channel decoupling, in place over both buffers.
@ Per element: if ang <= 0, flip its sign bit (veor with sign of mag)
@ and select which of mag +/- ang goes where via the vand/vbic masks;
@ results are written back through lagging pointers r3 (mag) and r12 (ang).
@ In:  r0 = mag, r1 = ang (both 16-byte aligned), r2 = blocksize
@      (multiple of 4; the 3: path handles a final/only group of 4)
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31             @ sign-bit mask
        subs            r2,  r2,  #4
        mov             r3,  r0                 @ r3  = mag store pointer
        mov             r12, r1                 @ r12 = ang store pointer
        beq             3f                      @ exactly 4 elements

        vld1.32         {d24-d25},[r1,:128]!    @ prime first group
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0            @ mask: ang <= 0
        vand            q9,  q11, q10           @ sign bits of mag
        veor            q12, q12, q9            @ ang ^= sign(mag)
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2            @ new mag
        vsub.f32        q11, q11, q3            @ new ang
1:      vld1.32         {d2-d3},  [r1,:128]!    @ software-pipelined main loop
        vld1.32         {d0-d1},  [r0,:128]!
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vst1.32         {d24-d25},[r3, :128]!   @ store previous group
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        subs            r2,  r2,  #8
        ble             2f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b

2:      vst1.32         {d2-d3},  [r3, :128]!   @ drain pipeline
        vst1.32         {d0-d1},  [r12,:128]!
        bxlt            lr                      @ done unless 4 remain (eq case)

3:      vld1.32         {d2-d3},  [r1,:128]     @ final (or only) group of 4
        vld1.32         {d0-d1},  [r0,:128]
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        vst1.32         {d2-d3},  [r0,:128]!
        vst1.32         {d0-d1},  [r1,:128]!
        bx              lr
endfunc
#endif
@ ff_vector_fmul_scalar_neon: dst[i] = src[i] * scalar
@ VFP ABI:   r0 = dst, r1 = src, scalar in s0 (d0[0]), len in r2
@ NOVFP ABI: scalar bits in r2, len in r3
@ Both pointers 16-byte aligned; len a multiple of 4.  Loop 1: does 16
@ elements/iter; 3: mops up 4 at a time.
function ff_vector_fmul_scalar_neon, export=1
VFP     len     .req    r2
NOVFP   len     .req    r3
VFP     vdup.32         q8,  d0[0]              @ broadcast scalar
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15
        beq             3f                      @ fewer than 16: scalar-ish loop
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8            @ main loop: 16 elements/iter
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!         @ drain last two stores
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!         @ tail: 4 elements/iter
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc
@ ff_vector_fmul_sv_scalar_2_neon: dst[i] = src[i] * sv[i/2][i%2] * scalar
@ for 2-element sub-vectors gathered through a pointer table.
@ VFP ABI:   r0 = dst, r1 = src, r2 = float** sv, scalar in s0, len at [sp]
@            ... NOVFP: scalar bits in r3, len at [sp]
@ len is consumed 4 at a time; each iteration fetches two sub-vector
@ pointers from r2.
function ff_vector_fmul_sv_scalar_2_neon, export=1
VFP     vdup.32         d16, d0[0]              @ broadcast scalar
NOVFP   vdup.32         d16, r3
NOVFP   ldr             r3,  [sp]               @ len
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
1:      subs            r3,  r3,  #4
        vmul.f32        d4,  d0,  d16           @ src * scalar
        vmul.f32        d5,  d1,  d16
        ldr             r12, [r2], #4           @ next sub-vector pointer
        vld1.32         {d2},[r12,:64]
        ldr             r12, [r2], #4
        vld1.32         {d3},[r12,:64]
        vmul.f32        d4,  d4,  d2            @ ... * sub-vector
        vmul.f32        d5,  d5,  d3
        beq             2f
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
        vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        b               1b
2:      vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        bx              lr
endfunc
@ ff_vector_fmul_sv_scalar_4_neon: like the _2 variant but with
@ 4-element sub-vectors fetched through the pointer table in r2.
@ VFP ABI:   r0 = dst, r1 = src, r2 = float** sv, scalar in s0, len at [sp]
@            ... NOVFP: scalar bits in r3, len at [sp]
@ Main loop 1: does 8 elements (two sub-vectors) per iteration; 3:
@ handles a remaining multiple of 4.
function ff_vector_fmul_sv_scalar_4_neon, export=1
VFP     vdup.32         q10, d0[0]              @ broadcast scalar
NOVFP   vdup.32         q10, r3
NOVFP   ldr             r3,  [sp]               @ len
        push            {lr}
        bics            lr,  r3,  #7
        beq             3f                      @ fewer than 8: tail loop only
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
1:      ldr             r12, [r2], #4           @ two sub-vector pointers/iter
        vld1.32         {q1},[r12,:128]
        ldr             r12, [r2], #4
        vld1.32         {q3},[r12,:128]
        vmul.f32        q8,  q0,  q10           @ src * scalar
        vmul.f32        q8,  q8,  q1            @ ... * sub-vector
        vmul.f32        q9,  q2,  q10
        vmul.f32        q9,  q9,  q3
        subs            lr,  lr,  #8
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
        vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        b               1b
2:      vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        ands            r3,  r3,  #7
        popeq           {pc}
3:      vld1.32         {q0},[r1,:128]!         @ tail: one sub-vector (4) per iter
        ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        vmul.f32        q0,  q0,  q10
        vmul.f32        q0,  q0,  q1
        vst1.32         {q0},[r0,:128]!
        subs            r3,  r3,  #4
        bgt             3b
        pop             {pc}
endfunc
@ ff_sv_fmul_scalar_2_neon: multiply 2-element sub-vectors, gathered via
@ a pointer table, by a scalar: dst[2k..2k+1] = sv[k][0..1] * scalar.
@ VFP ABI:   r0 = dst, r1 = float** sv, scalar in s0, len in r2
@ NOVFP ABI: scalar bits in r2, len in r3.  len consumed 4/iteration.
function ff_sv_fmul_scalar_2_neon, export=1
VFP     len     .req    r2
NOVFP   len     .req    r3
VFP     vdup.32         q8,  d0[0]              @ broadcast scalar
NOVFP   vdup.32         q8,  r2
        ldr             r12, [r1], #4           @ first two sub-vector pointers
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
1:      vmul.f32        q1,  q0,  q8
        subs            len, len, #4
        beq             2f
        ldr             r12, [r1], #4
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
        vst1.32         {q1},[r0,:128]!
        b               1b
2:      vst1.32         {q1},[r0,:128]!
        bx              lr
        .unreq          len
endfunc
@ ff_sv_fmul_scalar_4_neon: multiply 4-element sub-vectors, gathered via
@ a pointer table, by a scalar: dst[4k..4k+3] = sv[k][0..3] * scalar.
@ VFP ABI:   r0 = dst, r1 = float** sv, scalar in s0, len in r2
@ NOVFP ABI: scalar bits in r2, len in r3.  len consumed 4/iteration.
function ff_sv_fmul_scalar_4_neon, export=1
VFP     len     .req    r2
NOVFP   len     .req    r3
VFP     vdup.32         q8,  d0[0]              @ broadcast scalar
NOVFP   vdup.32         q8,  r2
1:      ldr             r12, [r1], #4           @ next sub-vector pointer
        vld1.32         {q0},[r12,:128]
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             1b
        bx              lr
        .unreq          len
endfunc
@ ff_butterflies_float_neon(float *v1, float *v2, int len)
@ In-place butterfly: (v1[i], v2[i]) <- (v1[i]+v2[i], v1[i]-v2[i]).
@ In:  r0 = v1, r1 = v2 (both 16-byte aligned), r2 = len (multiple of 4)
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1            @ difference -> v2
        vadd.f32        q1,  q0,  q1            @ sum -> v1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc
@ ff_scalarproduct_float_neon(const float *v1, const float *v2, int len)
@ Dot product of two float vectors.  Accumulates 4 lanes in q2, then
@ reduces pairwise; result is returned in s0 (hardfloat) or r0 (NOVFP).
@ In:  r0 = v1, r1 = v2 (both 16-byte aligned), r2 = len (multiple of 4)
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0               @ clear accumulator
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1            @ acc += v1*v2 per lane
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5            @ horizontal reduction
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]              @ softfloat return in r0
        bx              lr
endfunc
@ ff_int32_to_float_fmul_scalar_neon: dst[i] = (float)src[i] * scalar
@ VFP ABI:   r0 = dst, r1 = src, scalar in s0 (d0[0]), len in r2
@ NOVFP ABI: scalar bits in r2, len in r3
@ Both pointers 16-byte aligned; len a multiple of 8 (subs #8 stepping).
function ff_int32_to_float_fmul_scalar_neon, export=1
VFP     vdup.32         q0,  d0[0]              @ broadcast scalar
VFP     len     .req    r2
NOVFP   vdup.32         q0,  r2
NOVFP   len     .req    r3

        vld1.32         {q1},[r1,:128]!         @ prime first 8 conversions
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8,  q2
1:      subs            len, len, #8
        pld             [r1, #16]
        vmul.f32        q9,  q3,  q0
        vmul.f32        q10, q8,  q0
        beq             2f
        vld1.32         {q1},[r1,:128]!         @ convert next 8 while storing
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8,  q2
        vst1.32         {q9}, [r0,:128]!
        vst1.32         {q10},[r0,:128]!
        b               1b
2:      vst1.32         {q9}, [r0,:128]!        @ flush final 8 products
        vst1.32         {q10},[r0,:128]!
        bx              lr
        .unreq          len
endfunc
@ ff_vector_fmul_reverse_neon(float *dst, const float *src0,
@                             const float *src1, int len)
@ dst[i] = src0[i] * src1[len-1-i].  src1 is walked backward 32 bytes at
@ a time (r12 = -32) and each 16-byte half is word-reversed with vrev64
@ plus crossed d-register operands to complete the reversal.
@ In:  all pointers 16-byte aligned; r3 = len (multiple of 8)
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2   @ r2 -> end of src1
        sub             r2,  r2,  #32
        mov             r12, #-32               @ backward stride
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3                 @ reverse within d-registers...
        vmul.f32        d16, d0,  d7            @ ...and cross d halves
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc
@ ff_vector_fmul_add_neon(float *dst, const float *src0,
@                         const float *src1, const float *src2, int len)
@ dst[i] = src0[i] * src1[i] + src2[i].  len arrives on the stack (r12).
@ In:  all pointers 16-byte aligned; len a multiple of 8 (subs #8)
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]               @ r12 = len
        vld1.32         {q0-q1},  [r1,:128]!    @ prime: src0, src1, src2
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10           @ products + src2
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0}, [r1,:128]!        @ overlap next loads/multiplies
        vld1.32         {q8}, [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1}, [r1,:128]!
        vld1.32         {q9}, [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!    @ flush final 8 results
        bx              lr
endfunc
@ ff_vector_clipf_neon: dst[i] = clamp(src[i], min, max)
@ VFP ABI:   r0 = dst, r1 = src, min in s0 (d0[0]), max in s1 (d0[1]),
@            len in r2
@ NOVFP ABI: min bits in r2, max bits in r3, len at [sp]
@ Both pointers 16-byte aligned; len a multiple of 8 (subs #8).
@ Clamps with vmin against max (q1) then vmax against min (q0).
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]              @ q1 = max
VFP     vdup.32         q0,  d0[0]              @ q0 = min
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]               @ len
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1            @ upper clamp
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0            @ lower clamp
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!         @ flush final 8 results
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc