ARM: simplify ff_put/avg_h264_chroma_mc4/8_neon definitions, no code change
[libav.git] / libavcodec / arm / h264dsp_neon.S
CommitLineData
1cce897a
MR
1/*
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "asm.S"
22
23 .fpu neon
24
5813e05d
MR
@ transpose_8x8: in-place 8x8 byte-matrix transpose across eight d-registers,
@ built from three rounds of VTRN at 32-, 16- and 8-bit granularity.
 25 .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
 26 vtrn.32 \r0, \r4
 27 vtrn.32 \r1, \r5
 28 vtrn.32 \r2, \r6
 29 vtrn.32 \r3, \r7
 30 vtrn.16 \r0, \r2
 31 vtrn.16 \r1, \r3
 32 vtrn.16 \r4, \r6
 33 vtrn.16 \r5, \r7
 34 vtrn.8 \r0, \r1
 35 vtrn.8 \r2, \r3
 36 vtrn.8 \r4, \r5
 37 vtrn.8 \r6, \r7
 38 .endm
 39
@ swap4: exchange the contents of \r0-\r3 with \r4-\r7 pairwise.
 40 .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
 41 vswp \r0, \r4
 42 vswp \r1, \r5
 43 vswp \r2, \r6
 44 vswp \r3, \r7
 45 .endm
 46
@ transpose16_4x4: 4x4 transpose of 16-bit elements, done on two register
@ groups in parallel (\r0-\r3 and \r4-\r7).
 47 .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
 48 vtrn.32 \r0, \r2
 49 vtrn.32 \r1, \r3
 50 vtrn.32 \r4, \r6
 51 vtrn.32 \r5, \r7
 52 vtrn.16 \r0, \r1
 53 vtrn.16 \r2, \r3
 54 vtrn.16 \r4, \r5
 55 vtrn.16 \r6, \r7
 56 .endm
57
1cce897a 58/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
77c45373
MR
@ h264_chroma_mc8: emit ff_put/avg_h264_chroma_mc8_neon.
@ C signature: void f(uint8_t *dst /*r0*/, uint8_t *src /*r1*/,
@                     int stride /*r2*/, int h /*r3*/, int x, int y /*stack*/)
@ Bilinear 8x1-per-row chroma interpolation with weights
@   A=(8-x)(8-y)  B=x(8-y)  C=(8-x)y  D=xy   (sum 64, rounded shift by 6).
@ \type selects plain store ("put") or rounded average with dst ("avg").
 59 .macro h264_chroma_mc8 type
 60function ff_\type\()_h264_chroma_mc8_neon, export=1
1cce897a
MR
 61 push {r4-r7, lr}
 62 ldrd r4, [sp, #20] @ r4 = x, r5 = y from the stack args
77c45373 63.ifc \type,avg
1cce897a
MR
 64 mov lr, r0 @ avg: second read pointer over dst
 65.endif
 66 pld [r1]
 67 pld [r1, r2]
 68
@ Weight setup: r7 = D = x*y (flags set), r6 = C = y*(8-x),
@ ip = B = x*(8-y), r4 = A = (8-x)*(8-y) = 64 - 8x - 8y + xy.
 69 muls r7, r4, r5
 70 rsb r6, r7, r5, lsl #3
 71 rsb ip, r7, r4, lsl #3
 72 sub r4, r7, r4, lsl #3
 73 sub r4, r4, r5, lsl #3
 74 add r4, r4, #64
 75
 76 beq 2f @ x*y == 0: 1-D (or copy) fast paths
 77
 78 add r5, r1, r2
 79
@ Full 2-D case; two rows per iteration, r4 becomes 2*stride.
 80 vdup.8 d0, r4
 81 lsl r4, r2, #1
 82 vdup.8 d1, ip
 83 vld1.64 {d4, d5}, [r1], r4
 84 vdup.8 d2, r6
 85 vld1.64 {d6, d7}, [r5], r4
 86 vdup.8 d3, r7
 87
 88 vext.8 d5, d4, d5, #1 @ d5 = src row shifted left by one pixel
 89 vext.8 d7, d6, d7, #1
 90
 911: pld [r5]
 92 vmull.u8 q8, d4, d0
 93 vmlal.u8 q8, d5, d1
 94 vld1.64 {d4, d5}, [r1], r4
 95 vmlal.u8 q8, d6, d2
 96 vext.8 d5, d4, d5, #1
 97 vmlal.u8 q8, d7, d3
 98 vmull.u8 q9, d6, d0
 99 subs r3, r3, #2
100 vmlal.u8 q9, d7, d1
101 vmlal.u8 q9, d4, d2
102 vmlal.u8 q9, d5, d3
103 vrshrn.u16 d16, q8, #6 @ (acc + 32) >> 6
104 vld1.64 {d6, d7}, [r5], r4
105 pld [r1]
106 vrshrn.u16 d17, q9, #6
77c45373 107.ifc \type,avg
1cce897a
MR
108 vld1.64 {d20}, [lr,:64], r2
109 vld1.64 {d21}, [lr,:64], r2
110 vrhadd.u8 q8, q8, q10 @ rounded average with existing dst
111.endif
112 vext.8 d7, d6, d7, #1
113 vst1.64 {d16}, [r0,:64], r2
114 vst1.64 {d17}, [r0,:64], r2
115 bgt 1b
116
117 pop {r4-r7, pc}
118
@ x*y == 0: collapse to a two-tap filter; d1 = 8x + 8y (one of them is 0).
1192: tst r6, r6 @ r6 == 0 means y == 0 (horizontal-only)
120 add ip, ip, r6
121 vdup.8 d0, r4
122 vdup.8 d1, ip
123
124 beq 4f
125
@ Vertical-only two-tap loop (x == 0), two rows per iteration.
126 add r5, r1, r2
127 lsl r4, r2, #1
128 vld1.64 {d4}, [r1], r4
129 vld1.64 {d6}, [r5], r4
130
1313: pld [r5]
132 vmull.u8 q8, d4, d0
133 vmlal.u8 q8, d6, d1
134 vld1.64 {d4}, [r1], r4
135 vmull.u8 q9, d6, d0
136 vmlal.u8 q9, d4, d1
137 vld1.64 {d6}, [r5], r4
138 vrshrn.u16 d16, q8, #6
139 vrshrn.u16 d17, q9, #6
77c45373 140.ifc \type,avg
1cce897a
MR
141 vld1.64 {d20}, [lr,:64], r2
142 vld1.64 {d21}, [lr,:64], r2
143 vrhadd.u8 q8, q8, q10
144.endif
145 subs r3, r3, #2
146 pld [r1]
147 vst1.64 {d16}, [r0,:64], r2
148 vst1.64 {d17}, [r0,:64], r2
149 bgt 3b
150
151 pop {r4-r7, pc}
152
@ Horizontal-only two-tap loop (y == 0), two rows per iteration.
1534: vld1.64 {d4, d5}, [r1], r2
154 vld1.64 {d6, d7}, [r1], r2
155 vext.8 d5, d4, d5, #1
156 vext.8 d7, d6, d7, #1
157
1585: pld [r1]
159 subs r3, r3, #2
160 vmull.u8 q8, d4, d0
161 vmlal.u8 q8, d5, d1
162 vld1.64 {d4, d5}, [r1], r2
163 vmull.u8 q9, d6, d0
164 vmlal.u8 q9, d7, d1
165 pld [r1]
166 vext.8 d5, d4, d5, #1
167 vrshrn.u16 d16, q8, #6
168 vrshrn.u16 d17, q9, #6
77c45373 169.ifc \type,avg
1cce897a
MR
170 vld1.64 {d20}, [lr,:64], r2
171 vld1.64 {d21}, [lr,:64], r2
172 vrhadd.u8 q8, q8, q10
173.endif
174 vld1.64 {d6, d7}, [r1], r2
175 vext.8 d7, d6, d7, #1
176 vst1.64 {d16}, [r0,:64], r2
177 vst1.64 {d17}, [r0,:64], r2
178 bgt 5b
179
180 pop {r4-r7, pc}
77c45373 181 .endfunc
1cce897a
MR
182 .endm
183
184/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
77c45373
MR
@ h264_chroma_mc4: emit ff_put/avg_h264_chroma_mc4_neon.
@ Same bilinear scheme and argument layout as h264_chroma_mc8, but for
@ 4-pixel-wide blocks: two 4-pixel rows are packed into one d-register
@ (via vtrn.32) so each VMULL still does a full 8-lane multiply.
185 .macro h264_chroma_mc4 type
186function ff_\type\()_h264_chroma_mc4_neon, export=1
1cce897a
MR
187 push {r4-r7, lr}
188 ldrd r4, [sp, #20] @ r4 = x, r5 = y
77c45373 189.ifc \type,avg
1cce897a
MR
190 mov lr, r0 @ avg: second read pointer over dst
191.endif
192 pld [r1]
193 pld [r1, r2]
194
@ Weights as in mc8: r7 = xy, r6 = y(8-x), ip = x(8-y), r4 = (8-x)(8-y).
195 muls r7, r4, r5
196 rsb r6, r7, r5, lsl #3
197 rsb ip, r7, r4, lsl #3
198 sub r4, r7, r4, lsl #3
199 sub r4, r4, r5, lsl #3
200 add r4, r4, #64
201
202 beq 2f @ x*y == 0: 1-D fast paths
203
204 add r5, r1, r2
205
206 vdup.8 d0, r4
207 lsl r4, r2, #1
208 vdup.8 d1, ip
209 vld1.64 {d4}, [r1], r4
210 vdup.8 d2, r6
211 vld1.64 {d6}, [r5], r4
212 vdup.8 d3, r7
213
@ Pack {row, row+1px} pairs into single d-registers.
214 vext.8 d5, d4, d5, #1
215 vext.8 d7, d6, d7, #1
216 vtrn.32 d4, d5
217 vtrn.32 d6, d7
218
@ Interleave weights to match the packed operand layout.
219 vtrn.32 d0, d1
220 vtrn.32 d2, d3
221
2221: pld [r5]
223 vmull.u8 q8, d4, d0
224 vmlal.u8 q8, d6, d2
225 vld1.64 {d4}, [r1], r4
226 vext.8 d5, d4, d5, #1
227 vtrn.32 d4, d5
228 vmull.u8 q9, d6, d0
229 vmlal.u8 q9, d4, d2
230 vld1.64 {d6}, [r5], r4
231 vadd.i16 d16, d16, d17 @ fold the two packed halves together
232 vadd.i16 d17, d18, d19
233 vrshrn.u16 d16, q8, #6
234 subs r3, r3, #2
235 pld [r1]
77c45373 236.ifc \type,avg
1cce897a
MR
237 vld1.32 {d20[0]}, [lr,:32], r2
238 vld1.32 {d20[1]}, [lr,:32], r2
239 vrhadd.u8 d16, d16, d20
240.endif
241 vext.8 d7, d6, d7, #1
242 vtrn.32 d6, d7
243 vst1.32 {d16[0]}, [r0,:32], r2
244 vst1.32 {d16[1]}, [r0,:32], r2
245 bgt 1b
246
247 pop {r4-r7, pc}
248
@ x*y == 0: two-tap filter, d1 = 8x + 8y (one term is zero).
2492: tst r6, r6 @ r6 == 0 means y == 0 (horizontal-only)
250 add ip, ip, r6
251 vdup.8 d0, r4
252 vdup.8 d1, ip
253 vtrn.32 d0, d1
254
255 beq 4f
256
@ Vertical-only loop: two rows packed per register pair.
257 vext.32 d1, d0, d1, #1
258 add r5, r1, r2
259 lsl r4, r2, #1
260 vld1.32 {d4[0]}, [r1], r4
261 vld1.32 {d4[1]}, [r5], r4
262
2633: pld [r5]
264 vmull.u8 q8, d4, d0
265 vld1.32 {d4[0]}, [r1], r4
266 vmull.u8 q9, d4, d1
267 vld1.32 {d4[1]}, [r5], r4
268 vadd.i16 d16, d16, d17
269 vadd.i16 d17, d18, d19
270 vrshrn.u16 d16, q8, #6
77c45373 271.ifc \type,avg
1cce897a
MR
272 vld1.32 {d20[0]}, [lr,:32], r2
273 vld1.32 {d20[1]}, [lr,:32], r2
274 vrhadd.u8 d16, d16, d20
275.endif
276 subs r3, r3, #2
277 pld [r1]
278 vst1.32 {d16[0]}, [r0,:32], r2
279 vst1.32 {d16[1]}, [r0,:32], r2
280 bgt 3b
281
282 pop {r4-r7, pc}
283
@ Horizontal-only loop (y == 0).
2844: vld1.64 {d4}, [r1], r2
285 vld1.64 {d6}, [r1], r2
286 vext.8 d5, d4, d5, #1
287 vext.8 d7, d6, d7, #1
288 vtrn.32 d4, d5
289 vtrn.32 d6, d7
290
2915: vmull.u8 q8, d4, d0
292 vmull.u8 q9, d6, d0
293 subs r3, r3, #2
294 vld1.64 {d4}, [r1], r2
295 vext.8 d5, d4, d5, #1
296 vtrn.32 d4, d5
297 vadd.i16 d16, d16, d17
298 vadd.i16 d17, d18, d19
299 pld [r1]
300 vrshrn.u16 d16, q8, #6
77c45373 301.ifc \type,avg
1cce897a
MR
302 vld1.32 {d20[0]}, [lr,:32], r2
303 vld1.32 {d20[1]}, [lr,:32], r2
304 vrhadd.u8 d16, d16, d20
305.endif
306 vld1.64 {d6}, [r1], r2
307 vext.8 d7, d6, d7, #1
308 vtrn.32 d6, d7
309 pld [r1]
310 vst1.32 {d16[0]}, [r0,:32], r2
311 vst1.32 {d16[1]}, [r0,:32], r2
312 bgt 5b
313
314 pop {r4-r7, pc}
77c45373 315 .endfunc
1cce897a
MR
316 .endm
317
@ Instantiate the put/avg variants of the 8- and 4-wide chroma MC functions.
318 .text
319 .align
320
77c45373
MR
321 h264_chroma_mc8 put
322 h264_chroma_mc8 avg
323 h264_chroma_mc4 put
324 h264_chroma_mc4 avg
ad74a0f8
MR
325
326 /* H.264 loop filter */
327
@ h264_loop_filter_start: common entry for the deblocking filters.
@ Loads the 4 tc0 bytes via the pointer passed on the stack into d24 and ip,
@ and returns early (bx lr) when alpha (r2) or beta (r3) is zero, or when
@ all tc0 values indicate nothing to filter (sign test after the shifts).
328 .macro h264_loop_filter_start
329 ldr ip, [sp] @ ip = tc0 pointer (5th argument)
330 tst r2, r2
331 ldr ip, [ip] @ ip = the four packed tc0 bytes
332 tstne r3, r3
333 vmov.32 d24[0], ip
334 and ip, ip, ip, lsl #16
335 bxeq lr @ alpha == 0 || beta == 0: nothing to do
336 ands ip, ip, ip, lsl #8
337 bxlt lr @ all tc0 < 0: filtering disabled
338 .endm
339
@ align_push_regs: spill d8-d15 (callee-saved per AAPCS) to a 16-byte
@ aligned stack area; ip keeps the extra adjustment for align_pop_regs.
340 .macro align_push_regs
341 and ip, sp, #15
342 add ip, ip, #32
343 sub sp, sp, ip
344 vst1.64 {d12-d15}, [sp,:128]
345 sub sp, sp, #32
346 vst1.64 {d8-d11}, [sp,:128]
347 .endm
348
@ align_pop_regs: restore d8-d15 and undo the alignment adjustment in ip.
349 .macro align_pop_regs
350 vld1.64 {d8-d11}, [sp,:128]!
351 vld1.64 {d12-d15}, [sp,:128], ip
352 .endm
353
@ h264_loop_filter_luma: core of the normal (bS < 4) luma deblocking filter,
@ operating on 16 pixels at once.  Inputs: q10/q9/q8 = p2/p1/p0,
@ q0/q1/q2 = q0/q1/q2 rows, r2 = alpha, r3 = beta, d24 = packed tc0.
@ Outputs: filtered p1 in q4|q9 (vbsl), p0 in q8, q0 in q0, q1 in q5|q1.
354 .macro h264_loop_filter_luma
355 vdup.8 q11, r2 @ alpha
356 vmovl.u8 q12, d24 @ spread tc0 bytes across all 16 lanes
357 vabd.u8 q6, q8, q0 @ abs(p0 - q0)
358 vmovl.u16 q12, d24
359 vabd.u8 q14, q9, q8 @ abs(p1 - p0)
360 vsli.16 q12, q12, #8
361 vabd.u8 q15, q1, q0 @ abs(q1 - q0)
362 vsli.32 q12, q12, #16
363 vclt.u8 q6, q6, q11 @ < alpha
364 vdup.8 q11, r3 @ beta
365 vclt.s8 q7, q12, #0 @ tc0 < 0: lanes excluded from filtering
366 vclt.u8 q14, q14, q11 @ < beta
367 vclt.u8 q15, q15, q11 @ < beta
368 vbic q6, q6, q7 @ q6 = per-lane filter enable mask
369 vabd.u8 q4, q10, q8 @ abs(p2 - p0)
370 vand q6, q6, q14
371 vabd.u8 q5, q2, q0 @ abs(q2 - q0)
372 vclt.u8 q4, q4, q11 @ < beta (p-side strong-ish condition)
373 vand q6, q6, q15
374 vclt.u8 q5, q5, q11 @ < beta (q-side)
375 vand q4, q4, q6
376 vand q5, q5, q6
377 vand q12, q12, q6 @ tc0 masked to active lanes
@ tc = tc0 (+1 per extra side condition, via the vsub below);
@ candidate p1'/q1' built from clipped averages.
378 vrhadd.u8 q14, q8, q0
379 vsub.i8 q6, q12, q4
380 vqadd.u8 q7, q9, q12
381 vhadd.u8 q10, q10, q14
382 vsub.i8 q6, q6, q5 @ q6 = final clip bound tc
383 vhadd.u8 q14, q2, q14
384 vmin.u8 q7, q7, q10
385 vqsub.u8 q11, q9, q12
386 vqadd.u8 q2, q1, q12
387 vmax.u8 q7, q7, q11 @ p1' clamped to p1 +/- tc0
388 vqsub.u8 q11, q1, q12
389 vmin.u8 q14, q2, q14
390 vmovl.u8 q2, d0
391 vmax.u8 q14, q14, q11 @ q1' clamped to q1 +/- tc0
@ delta = clip( ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc )
392 vmovl.u8 q10, d1
393 vsubw.u8 q2, q2, d16
394 vsubw.u8 q10, q10, d17
395 vshl.i16 q2, q2, #2
396 vshl.i16 q10, q10, #2
397 vaddw.u8 q2, q2, d18
398 vaddw.u8 q10, q10, d19
399 vsubw.u8 q2, q2, d2
400 vsubw.u8 q10, q10, d3
401 vrshrn.i16 d4, q2, #3
402 vrshrn.i16 d5, q10, #3
403 vbsl q4, q7, q9 @ select new/old p1 per lane
404 vbsl q5, q14, q1 @ select new/old q1 per lane
405 vneg.s8 q7, q6
406 vmovl.u8 q14, d16
407 vmin.s8 q2, q2, q6
408 vmovl.u8 q6, d17
409 vmax.s8 q2, q2, q7 @ delta clipped to [-tc, tc]
410 vmovl.u8 q11, d0
411 vmovl.u8 q12, d1
412 vaddw.s8 q14, q14, d4 @ p0 += delta
413 vaddw.s8 q6, q6, d5
414 vsubw.s8 q11, q11, d4 @ q0 -= delta
415 vsubw.s8 q12, q12, d5
416 vqmovun.s16 d16, q14 @ saturate back to u8
417 vqmovun.s16 d17, q6
418 vqmovun.s16 d0, q11
419 vqmovun.s16 d1, q12
420 .endm
421
@ void ff_h264_v_loop_filter_luma_neon(uint8_t *pix /*r0*/, int stride /*r1*/,
@                                      int alpha /*r2*/, int beta /*r3*/,
@                                      int8_t *tc0 /*sp*/)
@ Vertical-edge filter: rows load directly into the p2..q2 registers,
@ then the shared luma kernel runs and the three changed rows store back.
422function ff_h264_v_loop_filter_luma_neon, export=1
423 h264_loop_filter_start
424
425 vld1.64 {d0, d1}, [r0,:128], r1 @ q0 row
426 vld1.64 {d2, d3}, [r0,:128], r1 @ q1 row
427 vld1.64 {d4, d5}, [r0,:128], r1 @ q2 row
428 sub r0, r0, r1, lsl #2
429 sub r0, r0, r1, lsl #1 @ rewind to the p2 row
430 vld1.64 {d20,d21}, [r0,:128], r1 @ p2
431 vld1.64 {d18,d19}, [r0,:128], r1 @ p1
432 vld1.64 {d16,d17}, [r0,:128], r1 @ p0
433
434 align_push_regs
435
436 h264_loop_filter_luma
437
438 sub r0, r0, r1, lsl #1
439 vst1.64 {d8, d9}, [r0,:128], r1 @ p1'
440 vst1.64 {d16,d17}, [r0,:128], r1 @ p0'
441 vst1.64 {d0, d1}, [r0,:128], r1 @ q0'
442 vst1.64 {d10,d11}, [r0,:128] @ q1'
443
444 align_pop_regs
445 bx lr
446 .endfunc
447
@ Horizontal-edge luma filter: loads a 16x8 tile, transposes so the edge
@ becomes vertical, runs the shared kernel, transposes back and stores.
448function ff_h264_h_loop_filter_luma_neon, export=1
449 h264_loop_filter_start
450
451 sub r0, r0, #4 @ back up to include 4 pixels left of the edge
452 vld1.64 {d6}, [r0], r1
453 vld1.64 {d20}, [r0], r1
454 vld1.64 {d18}, [r0], r1
455 vld1.64 {d16}, [r0], r1
456 vld1.64 {d0}, [r0], r1
457 vld1.64 {d2}, [r0], r1
458 vld1.64 {d4}, [r0], r1
459 vld1.64 {d26}, [r0], r1
460 vld1.64 {d7}, [r0], r1
461 vld1.64 {d21}, [r0], r1
462 vld1.64 {d19}, [r0], r1
463 vld1.64 {d17}, [r0], r1
464 vld1.64 {d1}, [r0], r1
465 vld1.64 {d3}, [r0], r1
466 vld1.64 {d5}, [r0], r1
467 vld1.64 {d27}, [r0], r1
468
5813e05d 469 transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
ad74a0f8
MR
470
@ The kernel clobbers q2/q10; stash them so the back-transpose has all rows.
471 align_push_regs
472 sub sp, sp, #16
473 vst1.64 {d4, d5}, [sp,:128]
474 sub sp, sp, #16
475 vst1.64 {d20,d21}, [sp,:128]
476
477 h264_loop_filter_luma
478
479 vld1.64 {d20,d21}, [sp,:128]!
480 vld1.64 {d4, d5}, [sp,:128]!
481
@ Note q4/q5 replace q9/q1 here: they hold the filtered p1/q1.
5813e05d 482 transpose_8x8 q3, q10, q4, q8, q0, q5, q2, q13
ad74a0f8
MR
483
484 sub r0, r0, r1, lsl #4 @ rewind 16 rows
485 vst1.64 {d6}, [r0], r1
486 vst1.64 {d20}, [r0], r1
487 vst1.64 {d8}, [r0], r1
488 vst1.64 {d16}, [r0], r1
489 vst1.64 {d0}, [r0], r1
490 vst1.64 {d10}, [r0], r1
491 vst1.64 {d4}, [r0], r1
492 vst1.64 {d26}, [r0], r1
493 vst1.64 {d7}, [r0], r1
494 vst1.64 {d21}, [r0], r1
495 vst1.64 {d9}, [r0], r1
496 vst1.64 {d17}, [r0], r1
497 vst1.64 {d1}, [r0], r1
498 vst1.64 {d11}, [r0], r1
499 vst1.64 {d5}, [r0], r1
500 vst1.64 {d27}, [r0], r1
501
502 align_pop_regs
503 bx lr
504 .endfunc
505
@ h264_loop_filter_chroma: 8-pixel chroma deblocking kernel.
@ Inputs: d18/d16 = p1/p0, d0/d2 = q0/q1, r2 = alpha, r3 = beta,
@ d24 = packed tc0.  Outputs: filtered p0 in d16, q0 in d0.
@ delta = clip(((q0-p0)*4 + p1 - q1 + 4) >> 3, -tc0, tc0), applied only on
@ lanes passing the alpha/beta threshold tests with tc0 >= 0.
506 .macro h264_loop_filter_chroma
507 vdup.8 d22, r2 @ alpha
508 vmovl.u8 q12, d24
509 vabd.u8 d26, d16, d0 @ abs(p0 - q0)
510 vmovl.u8 q2, d0
511 vabd.u8 d28, d18, d16 @ abs(p1 - p0)
512 vsubw.u8 q2, q2, d16
513 vsli.16 d24, d24, #8 @ duplicate tc0 per pixel pair
514 vshl.i16 q2, q2, #2
515 vabd.u8 d30, d2, d0 @ abs(q1 - q0)
516 vaddw.u8 q2, q2, d18
517 vclt.u8 d26, d26, d22 @ < alpha
518 vsubw.u8 q2, q2, d2
519 vdup.8 d22, r3 @ beta
520 vclt.s8 d25, d24, #0 @ tc0 < 0: lane disabled
521 vrshrn.i16 d4, q2, #3 @ raw delta
522 vclt.u8 d28, d28, d22 @ < beta
523 vbic d26, d26, d25
524 vclt.u8 d30, d30, d22 @ < beta
525 vand d26, d26, d28
526 vneg.s8 d25, d24
527 vand d26, d26, d30 @ d26 = enable mask
528 vmin.s8 d4, d4, d24
529 vmovl.u8 q14, d16
530 vand d4, d4, d26 @ delta zeroed on disabled lanes
531 vmax.s8 d4, d4, d25 @ delta clipped to [-tc0, tc0]
532 vmovl.u8 q11, d0
533 vaddw.s8 q14, q14, d4 @ p0 += delta
534 vsubw.s8 q11, q11, d4 @ q0 -= delta
535 vqmovun.s16 d16, q14
536 vqmovun.s16 d0, q11
537 .endm
538
@ Vertical-edge chroma filter: load p1/p0/q0/q1 rows, run the kernel,
@ store the two changed rows back.
539function ff_h264_v_loop_filter_chroma_neon, export=1
540 h264_loop_filter_start
541
542 sub r0, r0, r1, lsl #1 @ rewind to the p1 row
543 vld1.64 {d18}, [r0,:64], r1
544 vld1.64 {d16}, [r0,:64], r1
545 vld1.64 {d0}, [r0,:64], r1
546 vld1.64 {d2}, [r0,:64]
547
548 h264_loop_filter_chroma
549
550 sub r0, r0, r1, lsl #1
551 vst1.64 {d16}, [r0,:64], r1 @ p0'
552 vst1.64 {d0}, [r0,:64], r1 @ q0'
553
554 bx lr
555 .endfunc
556
@ Horizontal-edge chroma filter: gather a 4x8 tile as 32-bit lanes,
@ transpose so the edge becomes vertical, filter, transpose and scatter back.
557function ff_h264_h_loop_filter_chroma_neon, export=1
558 h264_loop_filter_start
559
560 sub r0, r0, #2 @ 2 pixels left of the edge
561 vld1.32 {d18[0]}, [r0], r1
562 vld1.32 {d16[0]}, [r0], r1
563 vld1.32 {d0[0]}, [r0], r1
564 vld1.32 {d2[0]}, [r0], r1
565 vld1.32 {d18[1]}, [r0], r1
566 vld1.32 {d16[1]}, [r0], r1
567 vld1.32 {d0[1]}, [r0], r1
568 vld1.32 {d2[1]}, [r0], r1
569
@ 4x4 byte transpose (twice, one per 32-bit lane half).
570 vtrn.16 d18, d0
571 vtrn.16 d16, d2
572 vtrn.8 d18, d16
573 vtrn.8 d0, d2
574
575 h264_loop_filter_chroma
576
@ Transpose back to row layout before storing.
577 vtrn.16 d18, d0
578 vtrn.16 d16, d2
579 vtrn.8 d18, d16
580 vtrn.8 d0, d2
581
582 sub r0, r0, r1, lsl #3 @ rewind 8 rows
583 vst1.32 {d18[0]}, [r0], r1
584 vst1.32 {d16[0]}, [r0], r1
585 vst1.32 {d0[0]}, [r0], r1
586 vst1.32 {d2[0]}, [r0], r1
587 vst1.32 {d18[1]}, [r0], r1
588 vst1.32 {d16[1]}, [r0], r1
589 vst1.32 {d0[1]}, [r0], r1
590 vst1.32 {d2[1]}, [r0], r1
591
592 bx lr
593 .endfunc
5813e05d
MR
594
595 /* H.264 qpel MC */
596
@ lowpass_const: load the 6-tap filter constants (5 and 20) into d6[0]
@ as two packed 16-bit values, used below via d6[0]/d6[1] scalar multiplies.
597 .macro lowpass_const r
598 movw \r, #5
599 movt \r, #20
600 vmov.32 d6[0], \r
601 .endm
602
@ lowpass_8: H.264 qpel 6-tap lowpass on two 8-pixel rows.
@ \r0\r1 and \r2\r3 hold 13 source pixels each; result is
@ (p0 + p5) + 20*(p2 + p3) - 5*(p1 + p4), rounded >> 5 when narrow=1
@ (u8 out in \d0/\d1), otherwise left as s16 in q-regs \d0/\d1.
603 .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
604.if \narrow
605 t0 .req q0
606 t1 .req q8
607.else
608 t0 .req \d0
609 t1 .req \d1
610.endif
611 vext.8 d2, \r0, \r1, #2
612 vext.8 d3, \r0, \r1, #3
613 vaddl.u8 q1, d2, d3 @ p2 + p3
614 vext.8 d4, \r0, \r1, #1
615 vext.8 d5, \r0, \r1, #4
616 vaddl.u8 q2, d4, d5 @ p1 + p4
617 vext.8 d30, \r0, \r1, #5
618 vaddl.u8 t0, \r0, d30 @ p0 + p5
619 vext.8 d18, \r2, \r3, #2
620 vmla.i16 t0, q1, d6[1] @ + 20*(p2+p3)
621 vext.8 d19, \r2, \r3, #3
622 vaddl.u8 q9, d18, d19
623 vext.8 d20, \r2, \r3, #1
624 vmls.i16 t0, q2, d6[0] @ - 5*(p1+p4)
625 vext.8 d21, \r2, \r3, #4
626 vaddl.u8 q10, d20, d21
627 vext.8 d31, \r2, \r3, #5
628 vaddl.u8 t1, \r2, d31
629 vmla.i16 t1, q9, d6[1]
630 vmls.i16 t1, q10, d6[0]
631.if \narrow
632 vqrshrun.s16 \d0, t0, #5 @ saturating (x+16)>>5 to u8
633 vqrshrun.s16 \d1, t1, #5
634.endif
635 .unreq t0
636 .unreq t1
637 .endm
638
@ lowpass_8_1: single-row variant of lowpass_8 (one 8-pixel result).
639 .macro lowpass_8_1 r0, r1, d0, narrow=1
640.if \narrow
641 t0 .req q0
642.else
643 t0 .req \d0
644.endif
645 vext.8 d2, \r0, \r1, #2
646 vext.8 d3, \r0, \r1, #3
647 vaddl.u8 q1, d2, d3
648 vext.8 d4, \r0, \r1, #1
649 vext.8 d5, \r0, \r1, #4
650 vaddl.u8 q2, d4, d5
651 vext.8 d30, \r0, \r1, #5
652 vaddl.u8 t0, \r0, d30
653 vmla.i16 t0, q1, d6[1]
654 vmls.i16 t0, q2, d6[0]
655.if \narrow
656 vqrshrun.s16 \d0, t0, #5
657.endif
658 .unreq t0
659 .endm
660
@ lowpass_8.16: second-pass 6-tap lowpass on 16-bit intermediates (the
@ output of a narrow=0 first pass), widening to 32 bits.  The *20 and *5
@ scalings are done with shift-and-add (x*20 = (x<<4)+(x<<2),
@ x*5 = x+(x<<2) folded via the subtract).  Result: (sum + 512) >> 10,
@ saturated to 8 u8 pixels in \d.
661 .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
662 vext.16 q1, \r0, \r1, #2
663 vext.16 q0, \r0, \r1, #3
664 vaddl.s16 q9, d2, d0 @ p2 + p3 (low half)
665 vext.16 q2, \r0, \r1, #1
666 vaddl.s16 q1, d3, d1 @ p2 + p3 (high half)
667 vext.16 q3, \r0, \r1, #4
668 vaddl.s16 q10, d4, d6 @ p1 + p4 (low)
669 vext.16 \r1, \r0, \r1, #5
670 vaddl.s16 q2, d5, d7 @ p1 + p4 (high)
671 vaddl.s16 q0, \h0, \h1 @ p0 + p5 (high)
672 vaddl.s16 q8, \l0, \l1 @ p0 + p5 (low)
673
674 vshl.i32 q3, q9, #4
675 vshl.i32 q9, q9, #2
676 vshl.i32 q15, q10, #2
677 vadd.i32 q9, q9, q3 @ 20*(p2+p3) low
678 vadd.i32 q10, q10, q15 @ 5*(p1+p4) low
679
680 vshl.i32 q3, q1, #4
681 vshl.i32 q1, q1, #2
682 vshl.i32 q15, q2, #2
683 vadd.i32 q1, q1, q3 @ 20*(p2+p3) high
684 vadd.i32 q2, q2, q15 @ 5*(p1+p4) high
685
686 vadd.i32 q9, q9, q8
687 vsub.i32 q9, q9, q10
688
689 vadd.i32 q1, q1, q0
690 vsub.i32 q1, q1, q2
691
692 vrshrn.s32 d18, q9, #10 @ rounded >> 10 back to s16
693 vrshrn.s32 d19, q1, #10
694
695 vqmovun.s16 \d, q9 @ saturate to u8
696 .endm
697
@ 16-wide horizontal lowpass, packed variant: runs the 8-wide helper on
@ both halves, output written contiguously (r3 = 8 = output pitch).
@ Clobbers r4 as the saved return address; tail-calls the second half.
698function put_h264_qpel16_h_lowpass_neon_packed
699 mov r4, lr
700 mov ip, #16
701 mov r3, #8
702 bl put_h264_qpel8_h_lowpass_neon
703 sub r1, r1, r2, lsl #4 @ rewind src 16 rows
704 add r1, r1, #8 @ right half
705 mov ip, #16
706 mov lr, r4
707 b put_h264_qpel8_h_lowpass_neon
708 .endfunc
709
@ 16-wide horizontal lowpass: left 8 columns, then falls through into the
@ 8-wide function for the right 8 columns (deliberate fall-through).
710function put_h264_qpel16_h_lowpass_neon
711 push {lr}
712 mov ip, #16
713 bl put_h264_qpel8_h_lowpass_neon
714 sub r0, r0, r3, lsl #4
715 sub r1, r1, r2, lsl #4
716 add r0, r0, #8
717 add r1, r1, #8
718 mov ip, #16
719 pop {lr}
720 .endfunc
721
@ 8-wide horizontal 6-tap lowpass.  r0 = dst, r1 = src (13 bytes/row read),
@ r2 = src stride, r3 = dst stride, ip = row count; two rows per iteration.
722function put_h264_qpel8_h_lowpass_neon
7231: vld1.64 {d0, d1}, [r1], r2
724 vld1.64 {d16,d17}, [r1], r2
725 subs ip, ip, #2
726 lowpass_8 d0, d1, d16, d17, d0, d16
727 vst1.64 {d0}, [r0,:64], r3
728 vst1.64 {d16}, [r0,:64], r3
729 bne 1b
730 bx lr
731 .endfunc
732
@ 16-wide horizontal lowpass averaged with a second source (l2 variant);
@ falls through into the 8-wide l2 function for the right half.
733function put_h264_qpel16_h_lowpass_l2_neon
734 push {lr}
735 mov ip, #16
736 bl put_h264_qpel8_h_lowpass_l2_neon
737 sub r0, r0, r2, lsl #4
738 sub r1, r1, r2, lsl #4
739 sub r3, r3, r2, lsl #4
740 add r0, r0, #8
741 add r1, r1, #8
742 add r3, r3, #8
743 mov ip, #16
744 pop {lr}
745 .endfunc
746
@ 8-wide horizontal lowpass, result averaged (vrhadd) with rows from r3.
747function put_h264_qpel8_h_lowpass_l2_neon
7481: vld1.64 {d0, d1}, [r1], r2
749 vld1.64 {d16,d17}, [r1], r2
750 vld1.64 {d28}, [r3], r2
751 vld1.64 {d29}, [r3], r2
752 subs ip, ip, #2
753 lowpass_8 d0, d1, d16, d17, d0, d1
754 vrhadd.u8 q0, q0, q14 @ round-to-nearest average with 2nd ref
755 vst1.64 {d0}, [r0,:64], r2
756 vst1.64 {d1}, [r0,:64], r2
757 bne 1b
758 bx lr
759 .endfunc
760
@ 16-wide vertical lowpass, packed output: four 8x8 tiles via the 8-wide
@ helper (dst pitch forced to 8); r4 saves lr across the inner calls.
761function put_h264_qpel16_v_lowpass_neon_packed
762 mov r4, lr
763 mov r2, #8
764 bl put_h264_qpel8_v_lowpass_neon
765 sub r1, r1, r3, lsl #2 @ rewind the 4-row filter overlap
766 bl put_h264_qpel8_v_lowpass_neon
767 sub r1, r1, r3, lsl #4
768 sub r1, r1, r3, lsl #2
769 add r1, r1, #8 @ right half of the source
770 bl put_h264_qpel8_v_lowpass_neon
771 sub r1, r1, r3, lsl #2
772 mov lr, r4
773 b put_h264_qpel8_v_lowpass_neon
774 .endfunc
775
@ 16-wide vertical lowpass into a strided dst; falls through into the
@ 8-wide function for the last tile (deliberate fall-through).
776function put_h264_qpel16_v_lowpass_neon
777 mov r4, lr
778 bl put_h264_qpel8_v_lowpass_neon
779 sub r1, r1, r3, lsl #2
780 bl put_h264_qpel8_v_lowpass_neon
781 sub r0, r0, r2, lsl #4
782 add r0, r0, #8
783 sub r1, r1, r3, lsl #4
784 sub r1, r1, r3, lsl #2
785 add r1, r1, #8
786 bl put_h264_qpel8_v_lowpass_neon
787 sub r1, r1, r3, lsl #2
788 mov lr, r4
789 .endfunc
790
@ 8-wide vertical 6-tap lowpass: loads 13 rows, transposes so the filter
@ runs horizontally via lowpass_8, transposes back and stores 8 rows.
@ r0 = dst, r1 = src, r2 = dst stride, r3 = src stride.
@ Uses d8-d15: callers must have saved them (vpush in the mc wrappers).
791function put_h264_qpel8_v_lowpass_neon
792 vld1.64 {d8}, [r1], r3
793 vld1.64 {d10}, [r1], r3
794 vld1.64 {d12}, [r1], r3
795 vld1.64 {d14}, [r1], r3
796 vld1.64 {d22}, [r1], r3
797 vld1.64 {d24}, [r1], r3
798 vld1.64 {d26}, [r1], r3
799 vld1.64 {d28}, [r1], r3
800 vld1.64 {d9}, [r1], r3
801 vld1.64 {d11}, [r1], r3
802 vld1.64 {d13}, [r1], r3
803 vld1.64 {d15}, [r1], r3
804 vld1.64 {d23}, [r1]
805
806 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
807 lowpass_8 d8, d9, d10, d11, d8, d10
808 lowpass_8 d12, d13, d14, d15, d12, d14
809 lowpass_8 d22, d23, d24, d25, d22, d24
810 lowpass_8 d26, d27, d28, d29, d26, d28
811 transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
812
813 vst1.64 {d8}, [r0,:64], r2
814 vst1.64 {d10}, [r0,:64], r2
815 vst1.64 {d12}, [r0,:64], r2
816 vst1.64 {d14}, [r0,:64], r2
817 vst1.64 {d22}, [r0,:64], r2
818 vst1.64 {d24}, [r0,:64], r2
819 vst1.64 {d26}, [r0,:64], r2
820 vst1.64 {d28}, [r0,:64], r2
821
822 bx lr
823 .endfunc
824
@ 16-wide vertical lowpass with averaging (l2); ip tracks the second
@ reference pointer.  Falls through into the 8-wide l2 function.
825function put_h264_qpel16_v_lowpass_l2_neon
826 mov r4, lr
827 bl put_h264_qpel8_v_lowpass_l2_neon
828 sub r1, r1, r3, lsl #2
829 bl put_h264_qpel8_v_lowpass_l2_neon
830 sub r0, r0, r3, lsl #4
831 sub ip, ip, r2, lsl #4
832 add r0, r0, #8
833 add ip, ip, #8
834 sub r1, r1, r3, lsl #4
835 sub r1, r1, r3, lsl #2
836 add r1, r1, #8
837 bl put_h264_qpel8_v_lowpass_l2_neon
838 sub r1, r1, r3, lsl #2
839 mov lr, r4
840 .endfunc
841
@ 8-wide vertical lowpass averaged with a second reference read from ip.
@ Same transpose-filter-transpose scheme as put_h264_qpel8_v_lowpass_neon,
@ then vrhadd with 8 rows loaded from [ip], stores through r0 with pitch r3.
@ Uses d8-d15: callers must have saved them.
842function put_h264_qpel8_v_lowpass_l2_neon
843 vld1.64 {d8}, [r1], r3
844 vld1.64 {d10}, [r1], r3
845 vld1.64 {d12}, [r1], r3
846 vld1.64 {d14}, [r1], r3
847 vld1.64 {d22}, [r1], r3
848 vld1.64 {d24}, [r1], r3
849 vld1.64 {d26}, [r1], r3
850 vld1.64 {d28}, [r1], r3
851 vld1.64 {d9}, [r1], r3
852 vld1.64 {d11}, [r1], r3
853 vld1.64 {d13}, [r1], r3
854 vld1.64 {d15}, [r1], r3
855 vld1.64 {d23}, [r1]
856
857 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
858 lowpass_8 d8, d9, d10, d11, d8, d9
859 lowpass_8 d12, d13, d14, d15, d12, d13
860 lowpass_8 d22, d23, d24, d25, d22, d23
861 lowpass_8 d26, d27, d28, d29, d26, d27
862 transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
863
@ Average with the second reference, interleaved with the loads to hide
@ load latency.
864 vld1.64 {d0}, [ip], r2
865 vld1.64 {d1}, [ip], r2
866 vld1.64 {d2}, [ip], r2
867 vld1.64 {d3}, [ip], r2
868 vld1.64 {d4}, [ip], r2
869 vrhadd.u8 q0, q0, q4
870 vld1.64 {d5}, [ip], r2
871 vrhadd.u8 q1, q1, q6
872 vld1.64 {d10}, [ip], r2
873 vrhadd.u8 q2, q2, q11
874 vld1.64 {d11}, [ip], r2
875
876 vst1.64 {d0}, [r0,:64], r3
877 vst1.64 {d1}, [r0,:64], r3
878 vrhadd.u8 q5, q5, q13
879 vst1.64 {d2}, [r0,:64], r3
880 vst1.64 {d3}, [r0,:64], r3
881 vst1.64 {d4}, [r0,:64], r3
882 vst1.64 {d5}, [r0,:64], r3
883 vst1.64 {d10}, [r0,:64], r3
884 vst1.64 {d11}, [r0,:64], r3
885
886 bx lr
887 .endfunc
888
@ 8-wide 2-D (horizontal + vertical) lowpass core.
@ Pass 1: horizontal lowpass of 13 src rows, kept as 16-bit (narrow=0) in a
@ scratch buffer at r4.  Pass 2: reload in reverse with negative pitch ip,
@ transpose the 16-bit data with swap4/transpose16_4x4, and run the
@ 16-bit vertical filter (lowpass_8.16).  Final 8x8 u8 result ends up
@ transposed back into d8-d15 (rows d12-d15 then d8-d11).
@ Uses d8-d15: callers must have saved them.  r4 = scratch (16*12 bytes).
889function put_h264_qpel8_hv_lowpass_neon_top
890 lowpass_const ip
891 mov ip, #12 @ 12 of the 13 rows in the loop, last row below
8921: vld1.64 {d0, d1}, [r1], r3
893 vld1.64 {d16,d17}, [r1], r3
894 subs ip, ip, #2
895 lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
896 vst1.64 {d22-d25}, [r4,:128]!
897 bne 1b
898
899 vld1.64 {d0, d1}, [r1]
900 lowpass_8_1 d0, d1, q12, narrow=0
901
902 mov ip, #-16 @ walk the scratch buffer backwards
903 add r4, r4, ip
904 vld1.64 {d30,d31}, [r4,:128], ip
905 vld1.64 {d20,d21}, [r4,:128], ip
906 vld1.64 {d18,d19}, [r4,:128], ip
907 vld1.64 {d16,d17}, [r4,:128], ip
908 vld1.64 {d14,d15}, [r4,:128], ip
909 vld1.64 {d12,d13}, [r4,:128], ip
910 vld1.64 {d10,d11}, [r4,:128], ip
911 vld1.64 {d8, d9}, [r4,:128], ip
912 vld1.64 {d6, d7}, [r4,:128], ip
913 vld1.64 {d4, d5}, [r4,:128], ip
914 vld1.64 {d2, d3}, [r4,:128], ip
915 vld1.64 {d0, d1}, [r4,:128]
916
@ Transpose the 16-bit intermediates (two register banks).
917 swap4 d1, d3, d5, d7, d8, d10, d12, d14
918 transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
919
920 swap4 d17, d19, d21, d31, d24, d26, d28, d22
921 transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
922
@ Spill one transposed bank back to scratch; it is reloaded piecewise for
@ the last four lowpass_8.16 calls below.
923 vst1.64 {d30,d31}, [r4,:128]!
924 vst1.64 {d6, d7}, [r4,:128]!
925 vst1.64 {d20,d21}, [r4,:128]!
926 vst1.64 {d4, d5}, [r4,:128]!
927 vst1.64 {d18,d19}, [r4,:128]!
928 vst1.64 {d2, d3}, [r4,:128]!
929 vst1.64 {d16,d17}, [r4,:128]!
930 vst1.64 {d0, d1}, [r4,:128]
931
932 lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
933 lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
934 lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
935 lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
936
937 vld1.64 {d16,d17}, [r4,:128], ip
938 vld1.64 {d30,d31}, [r4,:128], ip
939 lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
940 vld1.64 {d16,d17}, [r4,:128], ip
941 vld1.64 {d30,d31}, [r4,:128], ip
942 lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
943 vld1.64 {d16,d17}, [r4,:128], ip
944 vld1.64 {d30,d31}, [r4,:128], ip
945 lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
946 vld1.64 {d16,d17}, [r4,:128], ip
947 vld1.64 {d30,d31}, [r4,:128]
948 lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
949
950 transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
951
952 bx lr
953 .endfunc
954
@ 8-wide hv lowpass: run the core, store the 8 result rows (d12-d15 first,
@ then d8-d11, matching the core's output order).  r10 preserves lr.
955function put_h264_qpel8_hv_lowpass_neon
956 mov r10, lr
957 bl put_h264_qpel8_hv_lowpass_neon_top
958 vst1.64 {d12}, [r0,:64], r2
959 vst1.64 {d13}, [r0,:64], r2
960 vst1.64 {d14}, [r0,:64], r2
961 vst1.64 {d15}, [r0,:64], r2
962 vst1.64 {d8}, [r0,:64], r2
963 vst1.64 {d9}, [r0,:64], r2
964 vst1.64 {d10}, [r0,:64], r2
965 vst1.64 {d11}, [r0,:64], r2
966
967 mov lr, r10
968 bx lr
969 .endfunc
970
@ 8-wide hv lowpass averaged with a packed 8x8 reference read from r2;
@ stores through r0 with pitch r3.
971function put_h264_qpel8_hv_lowpass_l2_neon
972 mov r10, lr
973 bl put_h264_qpel8_hv_lowpass_neon_top
974
975 vld1.64 {d0, d1}, [r2,:128]!
976 vld1.64 {d2, d3}, [r2,:128]!
977 vrhadd.u8 q0, q0, q6
978 vld1.64 {d4, d5}, [r2,:128]!
979 vrhadd.u8 q1, q1, q7
980 vld1.64 {d6, d7}, [r2,:128]!
981 vrhadd.u8 q2, q2, q4
982
983 vst1.64 {d0}, [r0,:64], r3
984 vrhadd.u8 q3, q3, q5
985 vst1.64 {d1}, [r0,:64], r3
986 vst1.64 {d2}, [r0,:64], r3
987 vst1.64 {d3}, [r0,:64], r3
988 vst1.64 {d4}, [r0,:64], r3
989 vst1.64 {d5}, [r0,:64], r3
990 vst1.64 {d6}, [r0,:64], r3
991 vst1.64 {d7}, [r0,:64], r3
992
993 mov lr, r10
994 bx lr
995 .endfunc
996
@ 16-wide hv lowpass: four 8x8 tiles; r9 preserves lr, last tile tail-called.
997function put_h264_qpel16_hv_lowpass_neon
998 mov r9, lr
999 bl put_h264_qpel8_hv_lowpass_neon
1000 sub r1, r1, r3, lsl #2
1001 bl put_h264_qpel8_hv_lowpass_neon
1002 sub r1, r1, r3, lsl #4
1003 sub r1, r1, r3, lsl #2
1004 add r1, r1, #8
1005 sub r0, r0, r2, lsl #4
1006 add r0, r0, #8
1007 bl put_h264_qpel8_hv_lowpass_neon
1008 sub r1, r1, r3, lsl #2
1009 mov lr, r9
1010 b put_h264_qpel8_hv_lowpass_neon
1011 .endfunc
1012
@ 16-wide hv lowpass with averaging; the packed reference sits 256 bytes
@ below the scratch pointer r4.
1013function put_h264_qpel16_hv_lowpass_l2_neon
1014 mov r9, lr
1015 sub r2, r4, #256
1016 bl put_h264_qpel8_hv_lowpass_l2_neon
1017 sub r1, r1, r3, lsl #2
1018 bl put_h264_qpel8_hv_lowpass_l2_neon
1019 sub r1, r1, r3, lsl #4
1020 sub r1, r1, r3, lsl #2
1021 add r1, r1, #8
1022 sub r0, r0, r3, lsl #4
1023 add r0, r0, #8
1024 bl put_h264_qpel8_hv_lowpass_l2_neon
1025 sub r1, r1, r3, lsl #2
1026 mov lr, r9
1027 b put_h264_qpel8_hv_lowpass_l2_neon
1028 .endfunc
1029
@ --------------------------------------------------------------------------
@ ff_put_h264_qpel8_mcXY_neon dispatchers.
@ C signature: void f(uint8_t *dst /*r0*/, uint8_t *src /*r1*/, int stride /*r2*/)
@ XY encodes the quarter-pel offset; each wrapper sets up source offsets,
@ scratch buffers and second-reference pointers, then calls/tail-calls the
@ lowpass helpers above.  Helpers that use d8-d15 are bracketed with
@ vpush/vpop {d8-d15} per AAPCS.
@ --------------------------------------------------------------------------
@ mc10: horizontal half-pel averaged with the unshifted source (r3 = src).
1030function ff_put_h264_qpel8_mc10_neon, export=1
1031 lowpass_const r3
1032 mov r3, r1
1033 sub r1, r1, #2 @ filter reads 2 pixels left of the block
1034 mov ip, #8
1035 b put_h264_qpel8_h_lowpass_l2_neon
1036 .endfunc
1037
@ mc20: pure horizontal half-pel.
1038function ff_put_h264_qpel8_mc20_neon, export=1
1039 lowpass_const r3
1040 sub r1, r1, #2
1041 mov r3, r2
1042 mov ip, #8
1043 b put_h264_qpel8_h_lowpass_neon
1044 .endfunc
1045
@ mc30: horizontal half-pel averaged with the source shifted right by one.
1046function ff_put_h264_qpel8_mc30_neon, export=1
1047 lowpass_const r3
1048 add r3, r1, #1
1049 sub r1, r1, #2
1050 mov ip, #8
1051 b put_h264_qpel8_h_lowpass_l2_neon
1052 .endfunc
1053
@ mc01: vertical half-pel averaged with the unshifted source (ip = 2nd ref).
1054function ff_put_h264_qpel8_mc01_neon, export=1
1055 push {lr}
1056 mov ip, r1
1057put_h264_qpel8_mc01: @ shared tail, entered by mc03 with ip = src+stride
1058 lowpass_const r3
1059 mov r3, r2
1060 sub r1, r1, r2, lsl #1 @ 2 rows above the block
1061 vpush {d8-d15}
1062 bl put_h264_qpel8_v_lowpass_l2_neon
1063 vpop {d8-d15}
1064 pop {pc}
1065 .endfunc
1066
@ mc11: h half-pel then v half-pel averaged; h result kept in a 64-byte
@ stack buffer.  Stack layout: [sp] = vregs(128) + buffer(64), args at +128.
1067function ff_put_h264_qpel8_mc11_neon, export=1
1068 push {r0, r1, r2, lr}
1069put_h264_qpel8_mc11: @ shared tail for mc31/mc13/mc33
1070 lowpass_const r3
1071 sub sp, sp, #64
1072 mov r0, sp
1073 sub r1, r1, #2
1074 mov r3, #8
1075 mov ip, #8
1076 vpush {d8-d15}
1077 bl put_h264_qpel8_h_lowpass_neon
1078 ldrd r0, [sp, #128] @ reload saved dst/src
1079 mov r3, r2
1080 add ip, sp, #64 @ 2nd ref = the h-filtered buffer
1081 sub r1, r1, r2, lsl #1
1082 mov r2, #8
1083 bl put_h264_qpel8_v_lowpass_l2_neon
1084 vpop {d8-d15}
1085 add sp, sp, #76 @ 64 buffer + r0,r1,r2 slots
1086 pop {pc}
1087 .endfunc
1088
@ mc21: h half-pel averaged with the hv result; r11 = frame pointer for the
@ 16-byte-aligned scratch area (8*8 h result + 16*12 hv scratch).
1089function ff_put_h264_qpel8_mc21_neon, export=1
1090 push {r0, r1, r4, r10, r11, lr}
1091put_h264_qpel8_mc21: @ shared tail for mc23
1092 lowpass_const r3
1093 mov r11, sp
1094 bic sp, sp, #15
1095 sub sp, sp, #(8*8+16*12)
1096 sub r1, r1, #2
1097 mov r3, #8
1098 mov r0, sp
1099 mov ip, #8
1100 vpush {d8-d15}
1101 bl put_h264_qpel8_h_lowpass_neon
1102 mov r4, r0 @ r4 = hv scratch (just past h buffer)
1103 ldrd r0, [r11] @ reload saved dst/src
1104 sub r1, r1, r2, lsl #1
1105 sub r1, r1, #2
1106 mov r3, r2
1107 sub r2, r4, #64 @ 2nd ref = the h-filtered buffer
1108 bl put_h264_qpel8_hv_lowpass_l2_neon
1109 vpop {d8-d15}
1110 add sp, r11, #8
1111 pop {r4, r10, r11, pc}
1112 .endfunc
1113
@ mc31: like mc11 but the vertical pass reads from src+1.
1114function ff_put_h264_qpel8_mc31_neon, export=1
1115 add r1, r1, #1
1116 push {r0, r1, r2, lr} @ saved r1 is src+1 for the v pass
1117 sub r1, r1, #1
1118 b put_h264_qpel8_mc11
1119 .endfunc
1120
@ mc02: pure vertical half-pel.
1121function ff_put_h264_qpel8_mc02_neon, export=1
1122 push {lr}
1123 lowpass_const r3
1124 sub r1, r1, r2, lsl #1
1125 mov r3, r2
1126 vpush {d8-d15}
1127 bl put_h264_qpel8_v_lowpass_neon
1128 vpop {d8-d15}
1129 pop {pc}
1130 .endfunc
1131
@ mc12: v half-pel averaged with the hv result (mirror of mc21).
1132function ff_put_h264_qpel8_mc12_neon, export=1
1133 push {r0, r1, r4, r10, r11, lr}
1134put_h264_qpel8_mc12: @ shared tail for mc32
1135 lowpass_const r3
1136 mov r11, sp
1137 bic sp, sp, #15
1138 sub sp, sp, #(8*8+16*12)
1139 sub r1, r1, r2, lsl #1
1140 mov r3, r2
1141 mov r2, #8
1142 mov r0, sp
1143 vpush {d8-d15}
1144 bl put_h264_qpel8_v_lowpass_neon
1145 mov r4, r0
1146 ldrd r0, [r11]
1147 sub r1, r1, r3, lsl #1
1148 sub r1, r1, #2
1149 sub r2, r4, #64
1150 bl put_h264_qpel8_hv_lowpass_l2_neon
1151 vpop {d8-d15}
1152 add sp, r11, #8
1153 pop {r4, r10, r11, pc}
1154 .endfunc
1155
@ mc22: pure 2-D half-pel (hv lowpass only, 16*12-byte scratch in r4).
1156function ff_put_h264_qpel8_mc22_neon, export=1
1157 push {r4, r10, r11, lr}
1158 mov r11, sp
1159 bic sp, sp, #15
1160 sub r1, r1, r2, lsl #1
1161 sub r1, r1, #2
1162 mov r3, r2
1163 sub sp, sp, #(16*12)
1164 mov r4, sp
1165 vpush {d8-d15}
1166 bl put_h264_qpel8_hv_lowpass_neon
1167 vpop {d8-d15}
1168 mov sp, r11
1169 pop {r4, r10, r11, pc}
1170 .endfunc
1171
@ mc32: mc12 with the horizontal source shifted right by one.
1172function ff_put_h264_qpel8_mc32_neon, export=1
1173 push {r0, r1, r4, r10, r11, lr}
1174 add r1, r1, #1
1175 b put_h264_qpel8_mc12
1176 .endfunc
1177
@ mc03: mc01 with the second reference one row down.
1178function ff_put_h264_qpel8_mc03_neon, export=1
1179 push {lr}
1180 add ip, r1, r2
1181 b put_h264_qpel8_mc01
1182 .endfunc
1183
@ mc13: mc11 starting one row down.
1184function ff_put_h264_qpel8_mc13_neon, export=1
1185 push {r0, r1, r2, lr}
1186 add r1, r1, r2
1187 b put_h264_qpel8_mc11
1188 .endfunc
1189
@ mc23: mc21 starting one row down.
1190function ff_put_h264_qpel8_mc23_neon, export=1
1191 push {r0, r1, r4, r10, r11, lr}
1192 add r1, r1, r2
1193 b put_h264_qpel8_mc21
1194 .endfunc
1195
@ mc33: mc11 with src+1 saved for the v pass and one row down.
1196function ff_put_h264_qpel8_mc33_neon, export=1
1197 add r1, r1, #1
1198 push {r0, r1, r2, lr}
1199 add r1, r1, r2
1200 sub r1, r1, #1
1201 b put_h264_qpel8_mc11
1202 .endfunc
1203
@ --------------------------------------------------------------------------
@ ff_put_h264_qpel16_mcXY_neon dispatchers: 16x16 versions of the qpel8
@ wrappers above, using the 16-wide lowpass helpers and larger scratch
@ buffers (256 bytes for a packed 16x16 intermediate).
@ --------------------------------------------------------------------------
@ mc10: horizontal half-pel averaged with the unshifted source.
1204function ff_put_h264_qpel16_mc10_neon, export=1
1205 lowpass_const r3
1206 mov r3, r1
1207 sub r1, r1, #2
1208 b put_h264_qpel16_h_lowpass_l2_neon
1209 .endfunc
1210
@ mc20: pure horizontal half-pel.
1211function ff_put_h264_qpel16_mc20_neon, export=1
1212 lowpass_const r3
1213 sub r1, r1, #2
1214 mov r3, r2
1215 b put_h264_qpel16_h_lowpass_neon
1216 .endfunc
1217
@ mc30: horizontal half-pel averaged with src+1.
1218function ff_put_h264_qpel16_mc30_neon, export=1
1219 lowpass_const r3
1220 add r3, r1, #1
1221 sub r1, r1, #2
1222 b put_h264_qpel16_h_lowpass_l2_neon
1223 .endfunc
1224
@ mc01: vertical half-pel averaged with the unshifted source.
1225function ff_put_h264_qpel16_mc01_neon, export=1
1226 push {r4, lr}
1227 mov ip, r1
1228put_h264_qpel16_mc01: @ shared tail, entered by mc03
1229 lowpass_const r3
1230 mov r3, r2
1231 sub r1, r1, r2, lsl #1
1232 vpush {d8-d15}
1233 bl put_h264_qpel16_v_lowpass_l2_neon
1234 vpop {d8-d15}
1235 pop {r4, pc}
1236 .endfunc
1237
@ mc11: h half-pel into a 256-byte stack buffer, then v half-pel averaged.
1238function ff_put_h264_qpel16_mc11_neon, export=1
1239 push {r0, r1, r4, lr}
1240put_h264_qpel16_mc11: @ shared tail for mc31/mc13/mc33
1241 lowpass_const r3
1242 sub sp, sp, #256
1243 mov r0, sp
1244 sub r1, r1, #2
1245 mov r3, #16
1246 vpush {d8-d15}
1247 bl put_h264_qpel16_h_lowpass_neon
1248 add r0, sp, #256 @ past vregs(128)+buffer is not yet args;
1249 ldrd r0, [r0, #64] @ reload saved dst/src from the stack frame
1250 mov r3, r2
1251 add ip, sp, #64 @ NOTE(review): 2nd-ref offset relative to
1252 sub r1, r1, r2, lsl #1 @ vpushed frame - verify against mc11 layout
1253 mov r2, #16
1254 bl put_h264_qpel16_v_lowpass_l2_neon
1255 vpop {d8-d15}
1256 add sp, sp, #(256+8)
1257 pop {r4, pc}
1258 .endfunc
1259
@ mc21: h half-pel (packed) averaged with the hv result.
1260function ff_put_h264_qpel16_mc21_neon, export=1
1261 push {r0, r1, r4-r5, r9-r11, lr}
1262put_h264_qpel16_mc21: @ shared tail for mc23
1263 lowpass_const r3
1264 mov r11, sp
1265 bic sp, sp, #15
1266 sub sp, sp, #(16*16+16*12)
1267 sub r1, r1, #2
1268 mov r0, sp
1269 vpush {d8-d15}
1270 bl put_h264_qpel16_h_lowpass_neon_packed
1271 mov r4, r0 @ r4 = hv scratch pointer
1272 ldrd r0, [r11] @ reload saved dst/src
1273 sub r1, r1, r2, lsl #1
1274 sub r1, r1, #2
1275 mov r3, r2
1276 bl put_h264_qpel16_hv_lowpass_l2_neon
1277 vpop {d8-d15}
1278 add sp, r11, #8
1279 pop {r4-r5, r9-r11, pc}
1280 .endfunc
1281
@ mc31: mc11 with src+1 saved for the v pass.
1282function ff_put_h264_qpel16_mc31_neon, export=1
1283 add r1, r1, #1
1284 push {r0, r1, r4, lr}
1285 sub r1, r1, #1
1286 b put_h264_qpel16_mc11
1287 .endfunc
1288
@ mc02: pure vertical half-pel.
1289function ff_put_h264_qpel16_mc02_neon, export=1
1290 push {r4, lr}
1291 lowpass_const r3
1292 sub r1, r1, r2, lsl #1
1293 mov r3, r2
1294 vpush {d8-d15}
1295 bl put_h264_qpel16_v_lowpass_neon
1296 vpop {d8-d15}
1297 pop {r4, pc}
1298 .endfunc
1299
@ mc12: v half-pel (packed) averaged with the hv result.
1300function ff_put_h264_qpel16_mc12_neon, export=1
1301 push {r0, r1, r4-r5, r9-r11, lr}
1302put_h264_qpel16_mc12: @ shared tail for mc32
1303 lowpass_const r3
1304 mov r11, sp
1305 bic sp, sp, #15
1306 sub sp, sp, #(16*16+16*12)
1307 sub r1, r1, r2, lsl #1
1308 mov r0, sp
1309 mov r3, r2
1310 vpush {d8-d15}
1311 bl put_h264_qpel16_v_lowpass_neon_packed
1312 mov r4, r0
1313 ldrd r0, [r11]
1314 sub r1, r1, r3, lsl #1
1315 sub r1, r1, #2
1316 mov r2, r3
1317 bl put_h264_qpel16_hv_lowpass_l2_neon
1318 vpop {d8-d15}
1319 add sp, r11, #8
1320 pop {r4-r5, r9-r11, pc}
1321 .endfunc
1322
@ mc22: pure 2-D half-pel (hv lowpass only).
1323function ff_put_h264_qpel16_mc22_neon, export=1
1324 push {r4, r9-r11, lr}
1325 lowpass_const r3
1326 mov r11, sp
1327 bic sp, sp, #15
1328 sub r1, r1, r2, lsl #1
1329 sub r1, r1, #2
1330 mov r3, r2
1331 sub sp, sp, #(16*12)
1332 mov r4, sp
1333 vpush {d8-d15}
1334 bl put_h264_qpel16_hv_lowpass_neon
1335 vpop {d8-d15}
1336 mov sp, r11
1337 pop {r4, r9-r11, pc}
1338 .endfunc
1339
@ mc32: mc12 with the horizontal source shifted right by one.
1340function ff_put_h264_qpel16_mc32_neon, export=1
1341 push {r0, r1, r4-r5, r9-r11, lr}
1342 add r1, r1, #1
1343 b put_h264_qpel16_mc12
1344 .endfunc
1345
@ mc03: mc01 with the second reference one row down.
1346function ff_put_h264_qpel16_mc03_neon, export=1
1347 push {r4, lr}
1348 add ip, r1, r2
1349 b put_h264_qpel16_mc01
1350 .endfunc
1351
@ mc13: mc11 starting one row down.
1352function ff_put_h264_qpel16_mc13_neon, export=1
1353 push {r0, r1, r4, lr}
1354 add r1, r1, r2
1355 b put_h264_qpel16_mc11
1356 .endfunc
1357
@ mc23: mc21 starting one row down.
1358function ff_put_h264_qpel16_mc23_neon, export=1
1359 push {r0, r1, r4-r5, r9-r11, lr}
1360 add r1, r1, r2
1361 b put_h264_qpel16_mc21
1362 .endfunc
1363
@ mc33: mc11 with src+1 for the v pass and one row down.
1364function ff_put_h264_qpel16_mc33_neon, export=1
1365 add r1, r1, #1
1366 push {r0, r1, r4, lr}
1367 add r1, r1, r2
1368 sub r1, r1, #1
1369 b put_h264_qpel16_mc11
1370 .endfunc