/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

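@ The transposes below are built from rounds of vtrn at decreasing
@ granularity: exchanging 32-bit, then 16-bit, then 8-bit lanes
@ between register pairs amounts to a full matrix transpose.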
.macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32 \r0, \r4
        vtrn.32 \r1, \r5
        vtrn.32 \r2, \r6
        vtrn.32 \r3, \r7
        vtrn.16 \r0, \r2
        vtrn.16 \r1, \r3
        vtrn.16 \r4, \r6
        vtrn.16 \r5, \r7
        vtrn.8  \r0, \r1
        vtrn.8  \r2, \r3
        vtrn.8  \r4, \r5
        vtrn.8  \r6, \r7
.endm

.macro transpose_4x4 r0 r1 r2 r3
        vtrn.16 \r0, \r2
        vtrn.16 \r1, \r3
        vtrn.8  \r0, \r1
        vtrn.8  \r2, \r3
.endm

.macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp \r0, \r4
        vswp \r1, \r5
        vswp \r2, \r6
        vswp \r3, \r7
.endm

.macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32 \r0, \r2
        vtrn.32 \r1, \r3
        vtrn.32 \r4, \r6
        vtrn.32 \r5, \r7
        vtrn.16 \r0, \r1
        vtrn.16 \r2, \r3
        vtrn.16 \r4, \r5
        vtrn.16 \r6, \r7
.endm

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
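@ A sketch in C of the bilinear interpolation this macro implements
@ (the standard H.264 chroma MC; variable names are illustrative):
@
@   int A = (8 - x) * (8 - y), B = x * (8 - y);
@   int C = (8 - x) * y,       D = x * y;
@   for (int i = 0; i < 8; i++)
@       dst[i] = (A * src[i]          + B * src[i + 1] +
@                 C * src[i + stride] + D * src[i + stride + 1] + 32) >> 6;
@
@ d0-d3 hold A, B, C, D; the branches below specialize the x == 0,
@ y == 0, and x == y == 0 cases to fewer multiplies.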
.macro h264_chroma_mc8 type
function ff_\type\()_h264_chroma_mc8_neon, export=1
        push {r4-r7, lr}
        ldrd r4, [sp, #20]
.ifc \type,avg
        mov lr, r0
.endif
        pld [r1]
        pld [r1, r2]

        muls r7, r4, r5
        rsb r6, r7, r5, lsl #3
        rsb ip, r7, r4, lsl #3
        sub r4, r7, r4, lsl #3
        sub r4, r4, r5, lsl #3
        add r4, r4, #64

        beq 2f

        add r5, r1, r2

        vdup.8 d0, r4
        lsl r4, r2, #1
        vdup.8 d1, ip
        vld1.64 {d4, d5}, [r1], r4
        vdup.8 d2, r6
        vld1.64 {d6, d7}, [r5], r4
        vdup.8 d3, r7

        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1

1:      pld [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d5, d1
        vld1.64 {d4, d5}, [r1], r4
        vmlal.u8 q8, d6, d2
        vext.8 d5, d4, d5, #1
        vmlal.u8 q8, d7, d3
        vmull.u8 q9, d6, d0
        subs r3, r3, #2
        vmlal.u8 q9, d7, d1
        vmlal.u8 q9, d4, d2
        vmlal.u8 q9, d5, d3
        vrshrn.u16 d16, q8, #6
        vld1.64 {d6, d7}, [r5], r4
        pld [r1]
        vrshrn.u16 d17, q9, #6
.ifc \type,avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
.endif
        vext.8 d7, d6, d7, #1
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt 1b

        pop {r4-r7, pc}

2:      tst r6, r6
        add ip, ip, r6
        vdup.8 d0, r4
        vdup.8 d1, ip

        beq 4f

        add r5, r1, r2
        lsl r4, r2, #1
        vld1.64 {d4}, [r1], r4
        vld1.64 {d6}, [r5], r4

3:      pld [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d6, d1
        vld1.64 {d4}, [r1], r4
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d4, d1
        vld1.64 {d6}, [r5], r4
        vrshrn.u16 d16, q8, #6
        vrshrn.u16 d17, q9, #6
.ifc \type,avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
.endif
        subs r3, r3, #2
        pld [r1]
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt 3b

        pop {r4-r7, pc}

4:      vld1.64 {d4, d5}, [r1], r2
        vld1.64 {d6, d7}, [r1], r2
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1

5:      pld [r1]
        subs r3, r3, #2
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d5, d1
        vld1.64 {d4, d5}, [r1], r2
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d7, d1
        pld [r1]
        vext.8 d5, d4, d5, #1
        vrshrn.u16 d16, q8, #6
        vrshrn.u16 d17, q9, #6
.ifc \type,avg
        vld1.64 {d20}, [lr,:64], r2
        vld1.64 {d21}, [lr,:64], r2
        vrhadd.u8 q8, q8, q10
.endif
        vld1.64 {d6, d7}, [r1], r2
        vext.8 d7, d6, d7, #1
        vst1.64 {d16}, [r0,:64], r2
        vst1.64 {d17}, [r0,:64], r2
        bgt 5b

        pop {r4-r7, pc}
.endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
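@ Same bilinear filter as chroma_mc8, but the A/B and C/D factors and
@ the two source phases are packed into single d registers (vtrn.32),
@ so one vmull/vmlal pair plus a horizontal vadd produces a 4-pixel row.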
.macro h264_chroma_mc4 type
function ff_\type\()_h264_chroma_mc4_neon, export=1
        push {r4-r7, lr}
        ldrd r4, [sp, #20]
.ifc \type,avg
        mov lr, r0
.endif
        pld [r1]
        pld [r1, r2]

        muls r7, r4, r5
        rsb r6, r7, r5, lsl #3
        rsb ip, r7, r4, lsl #3
        sub r4, r7, r4, lsl #3
        sub r4, r4, r5, lsl #3
        add r4, r4, #64

        beq 2f

        add r5, r1, r2

        vdup.8 d0, r4
        lsl r4, r2, #1
        vdup.8 d1, ip
        vld1.64 {d4}, [r1], r4
        vdup.8 d2, r6
        vld1.64 {d6}, [r5], r4
        vdup.8 d3, r7

        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
        vtrn.32 d4, d5
        vtrn.32 d6, d7

        vtrn.32 d0, d1
        vtrn.32 d2, d3

1:      pld [r5]
        vmull.u8 q8, d4, d0
        vmlal.u8 q8, d6, d2
        vld1.64 {d4}, [r1], r4
        vext.8 d5, d4, d5, #1
        vtrn.32 d4, d5
        vmull.u8 q9, d6, d0
        vmlal.u8 q9, d4, d2
        vld1.64 {d6}, [r5], r4
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        vrshrn.u16 d16, q8, #6
        subs r3, r3, #2
        pld [r1]
.ifc \type,avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
.endif
        vext.8 d7, d6, d7, #1
        vtrn.32 d6, d7
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt 1b

        pop {r4-r7, pc}

2:      tst r6, r6
        add ip, ip, r6
        vdup.8 d0, r4
        vdup.8 d1, ip
        vtrn.32 d0, d1

        beq 4f

        vext.32 d1, d0, d1, #1
        add r5, r1, r2
        lsl r4, r2, #1
        vld1.32 {d4[0]}, [r1], r4
        vld1.32 {d4[1]}, [r5], r4

3:      pld [r5]
        vmull.u8 q8, d4, d0
        vld1.32 {d4[0]}, [r1], r4
        vmull.u8 q9, d4, d1
        vld1.32 {d4[1]}, [r5], r4
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        vrshrn.u16 d16, q8, #6
.ifc \type,avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
.endif
        subs r3, r3, #2
        pld [r1]
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt 3b

        pop {r4-r7, pc}

4:      vld1.64 {d4}, [r1], r2
        vld1.64 {d6}, [r1], r2
        vext.8 d5, d4, d5, #1
        vext.8 d7, d6, d7, #1
        vtrn.32 d4, d5
        vtrn.32 d6, d7

5:      vmull.u8 q8, d4, d0
        vmull.u8 q9, d6, d0
        subs r3, r3, #2
        vld1.64 {d4}, [r1], r2
        vext.8 d5, d4, d5, #1
        vtrn.32 d4, d5
        vadd.i16 d16, d16, d17
        vadd.i16 d17, d18, d19
        pld [r1]
        vrshrn.u16 d16, q8, #6
.ifc \type,avg
        vld1.32 {d20[0]}, [lr,:32], r2
        vld1.32 {d20[1]}, [lr,:32], r2
        vrhadd.u8 d16, d16, d20
.endif
        vld1.64 {d6}, [r1], r2
        vext.8 d7, d6, d7, #1
        vtrn.32 d6, d7
        pld [r1]
        vst1.32 {d16[0]}, [r0,:32], r2
        vst1.32 {d16[1]}, [r0,:32], r2
        bgt 5b

        pop {r4-r7, pc}
.endfunc
.endm

        .text
        .align

h264_chroma_mc8 put
h264_chroma_mc8 avg
h264_chroma_mc4 put
h264_chroma_mc4 avg

/* H.264 loop filter */

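@ A sketch in C of the normal-strength filter applied below to each
@ line of pixels across the edge (p2..q2, alpha, beta and tc0 as in
@ the H.264 spec; clip() is clip3 to the given range, avg() rounds up):
@
@   if (tc0 >= 0 && |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta) {
@       int tc = tc0 + (|p2-p0| < beta) + (|q2-q0| < beta);
@       int delta = clip((((q0-p0) << 2) + (p1-q1) + 4) >> 3, -tc, tc);
@       if (|p2-p0| < beta) p1 = clip((p2 + avg(p0,q0)) >> 1, p1-tc0, p1+tc0);
@       if (|q2-q0| < beta) q1 = clip((q2 + avg(p0,q0)) >> 1, q1-tc0, q1+tc0);
@       p0 = clip_uint8(p0 + delta);
@       q0 = clip_uint8(q0 - delta);
@   }
@
@ h264_loop_filter_start loads the four tc0 bytes, returns immediately
@ (bxeq) when either strength argument is zero and, via the
@ sign-propagating and/lsl pair, also returns (bxlt) when every tc0 is
@ negative, i.e. no edge needs filtering.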
.macro h264_loop_filter_start
        ldr ip, [sp]
        tst r2, r2
        ldr ip, [ip]
        tstne r3, r3
        vmov.32 d24[0], ip
        and ip, ip, ip, lsl #16
        bxeq lr
        ands ip, ip, ip, lsl #8
        bxlt lr
.endm

.macro align_push_regs
        and ip, sp, #15
        add ip, ip, #32
        sub sp, sp, ip
        vst1.64 {d12-d15}, [sp,:128]
        sub sp, sp, #32
        vst1.64 {d8-d11}, [sp,:128]
.endm

.macro align_pop_regs
        vld1.64 {d8-d11}, [sp,:128]!
        vld1.64 {d12-d15}, [sp,:128], ip
.endm

.macro h264_loop_filter_luma
        vdup.8 q11, r2 @ alpha
        vmovl.u8 q12, d24
        vabd.u8 q6, q8, q0 @ abs(p0 - q0)
        vmovl.u16 q12, d24
        vabd.u8 q14, q9, q8 @ abs(p1 - p0)
        vsli.16 q12, q12, #8
        vabd.u8 q15, q1, q0 @ abs(q1 - q0)
        vsli.32 q12, q12, #16
        vclt.u8 q6, q6, q11 @ < alpha
        vdup.8 q11, r3 @ beta
        vclt.s8 q7, q12, #0
        vclt.u8 q14, q14, q11 @ < beta
        vclt.u8 q15, q15, q11 @ < beta
        vbic q6, q6, q7
        vabd.u8 q4, q10, q8 @ abs(p2 - p0)
        vand q6, q6, q14
        vabd.u8 q5, q2, q0 @ abs(q2 - q0)
        vclt.u8 q4, q4, q11 @ < beta
        vand q6, q6, q15
        vclt.u8 q5, q5, q11 @ < beta
        vand q4, q4, q6
        vand q5, q5, q6
        vand q12, q12, q6
        vrhadd.u8 q14, q8, q0
        vsub.i8 q6, q12, q4
        vqadd.u8 q7, q9, q12
        vhadd.u8 q10, q10, q14
        vsub.i8 q6, q6, q5
        vhadd.u8 q14, q2, q14
        vmin.u8 q7, q7, q10
        vqsub.u8 q11, q9, q12
        vqadd.u8 q2, q1, q12
        vmax.u8 q7, q7, q11
        vqsub.u8 q11, q1, q12
        vmin.u8 q14, q2, q14
        vmovl.u8 q2, d0
        vmax.u8 q14, q14, q11
        vmovl.u8 q10, d1
        vsubw.u8 q2, q2, d16
        vsubw.u8 q10, q10, d17
        vshl.i16 q2, q2, #2
        vshl.i16 q10, q10, #2
        vaddw.u8 q2, q2, d18
        vaddw.u8 q10, q10, d19
        vsubw.u8 q2, q2, d2
        vsubw.u8 q10, q10, d3
        vrshrn.i16 d4, q2, #3
        vrshrn.i16 d5, q10, #3
        vbsl q4, q7, q9
        vbsl q5, q14, q1
        vneg.s8 q7, q6
        vmovl.u8 q14, d16
        vmin.s8 q2, q2, q6
        vmovl.u8 q6, d17
        vmax.s8 q2, q2, q7
        vmovl.u8 q11, d0
        vmovl.u8 q12, d1
        vaddw.s8 q14, q14, d4
        vaddw.s8 q6, q6, d5
        vsubw.s8 q11, q11, d4
        vsubw.s8 q12, q12, d5
        vqmovun.s16 d16, q14
        vqmovun.s16 d17, q6
        vqmovun.s16 d0, q11
        vqmovun.s16 d1, q12
.endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        vld1.64 {d0, d1}, [r0,:128], r1
        vld1.64 {d2, d3}, [r0,:128], r1
        vld1.64 {d4, d5}, [r0,:128], r1
        sub r0, r0, r1, lsl #2
        sub r0, r0, r1, lsl #1
        vld1.64 {d20,d21}, [r0,:128], r1
        vld1.64 {d18,d19}, [r0,:128], r1
        vld1.64 {d16,d17}, [r0,:128], r1

        align_push_regs

        h264_loop_filter_luma

        sub r0, r0, r1, lsl #1
        vst1.64 {d8, d9}, [r0,:128], r1
        vst1.64 {d16,d17}, [r0,:128], r1
        vst1.64 {d0, d1}, [r0,:128], r1
        vst1.64 {d10,d11}, [r0,:128]

        align_pop_regs
        bx lr
.endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub r0, r0, #4
        vld1.64 {d6}, [r0], r1
        vld1.64 {d20}, [r0], r1
        vld1.64 {d18}, [r0], r1
        vld1.64 {d16}, [r0], r1
        vld1.64 {d0}, [r0], r1
        vld1.64 {d2}, [r0], r1
        vld1.64 {d4}, [r0], r1
        vld1.64 {d26}, [r0], r1
        vld1.64 {d7}, [r0], r1
        vld1.64 {d21}, [r0], r1
        vld1.64 {d19}, [r0], r1
        vld1.64 {d17}, [r0], r1
        vld1.64 {d1}, [r0], r1
        vld1.64 {d3}, [r0], r1
        vld1.64 {d5}, [r0], r1
        vld1.64 {d27}, [r0], r1

        transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13

        align_push_regs

        h264_loop_filter_luma

        transpose_4x4 q4, q8, q0, q5

        sub r0, r0, r1, lsl #4
        add r0, r0, #2
        vst1.32 {d8[0]}, [r0], r1
        vst1.32 {d16[0]}, [r0], r1
        vst1.32 {d0[0]}, [r0], r1
        vst1.32 {d10[0]}, [r0], r1
        vst1.32 {d8[1]}, [r0], r1
        vst1.32 {d16[1]}, [r0], r1
        vst1.32 {d0[1]}, [r0], r1
        vst1.32 {d10[1]}, [r0], r1
        vst1.32 {d9[0]}, [r0], r1
        vst1.32 {d17[0]}, [r0], r1
        vst1.32 {d1[0]}, [r0], r1
        vst1.32 {d11[0]}, [r0], r1
        vst1.32 {d9[1]}, [r0], r1
        vst1.32 {d17[1]}, [r0], r1
        vst1.32 {d1[1]}, [r0], r1
        vst1.32 {d11[1]}, [r0], r1

        align_pop_regs
        bx lr
.endfunc

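@ The chroma filter is the p1/q1-free variant of the same edge filter:
@ only p0 and q0 are rewritten, delta is the same
@ (((q0-p0) << 2) + (p1-q1) + 4) >> 3 term, and the clipping range
@ comes straight from the splatted tc0 bytes.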
.macro h264_loop_filter_chroma
        vdup.8 d22, r2 @ alpha
        vmovl.u8 q12, d24
        vabd.u8 d26, d16, d0 @ abs(p0 - q0)
        vmovl.u8 q2, d0
        vabd.u8 d28, d18, d16 @ abs(p1 - p0)
        vsubw.u8 q2, q2, d16
        vsli.16 d24, d24, #8
        vshl.i16 q2, q2, #2
        vabd.u8 d30, d2, d0 @ abs(q1 - q0)
        vaddw.u8 q2, q2, d18
        vclt.u8 d26, d26, d22 @ < alpha
        vsubw.u8 q2, q2, d2
        vdup.8 d22, r3 @ beta
        vclt.s8 d25, d24, #0
        vrshrn.i16 d4, q2, #3
        vclt.u8 d28, d28, d22 @ < beta
        vbic d26, d26, d25
        vclt.u8 d30, d30, d22 @ < beta
        vand d26, d26, d28
        vneg.s8 d25, d24
        vand d26, d26, d30
        vmin.s8 d4, d4, d24
        vmovl.u8 q14, d16
        vand d4, d4, d26
        vmax.s8 d4, d4, d25
        vmovl.u8 q11, d0
        vaddw.s8 q14, q14, d4
        vsubw.s8 q11, q11, d4
        vqmovun.s16 d16, q14
        vqmovun.s16 d0, q11
.endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub r0, r0, r1, lsl #1
        vld1.64 {d18}, [r0,:64], r1
        vld1.64 {d16}, [r0,:64], r1
        vld1.64 {d0}, [r0,:64], r1
        vld1.64 {d2}, [r0,:64]

        h264_loop_filter_chroma

        sub r0, r0, r1, lsl #1
        vst1.64 {d16}, [r0,:64], r1
        vst1.64 {d0}, [r0,:64], r1

        bx lr
.endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub r0, r0, #2
        vld1.32 {d18[0]}, [r0], r1
        vld1.32 {d16[0]}, [r0], r1
        vld1.32 {d0[0]}, [r0], r1
        vld1.32 {d2[0]}, [r0], r1
        vld1.32 {d18[1]}, [r0], r1
        vld1.32 {d16[1]}, [r0], r1
        vld1.32 {d0[1]}, [r0], r1
        vld1.32 {d2[1]}, [r0], r1

        vtrn.16 d18, d0
        vtrn.16 d16, d2
        vtrn.8 d18, d16
        vtrn.8 d0, d2

        h264_loop_filter_chroma

        vtrn.16 d18, d0
        vtrn.16 d16, d2
        vtrn.8 d18, d16
        vtrn.8 d0, d2

        sub r0, r0, r1, lsl #3
        vst1.32 {d18[0]}, [r0], r1
        vst1.32 {d16[0]}, [r0], r1
        vst1.32 {d0[0]}, [r0], r1
        vst1.32 {d2[0]}, [r0], r1
        vst1.32 {d18[1]}, [r0], r1
        vst1.32 {d16[1]}, [r0], r1
        vst1.32 {d0[1]}, [r0], r1
        vst1.32 {d2[1]}, [r0], r1

        bx lr
.endfunc

/* H.264 qpel MC */

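@ The half-pel positions use the standard H.264 six-tap filter.  A
@ sketch in C of what lowpass_8 computes per output pixel (names are
@ illustrative):
@
@   dst[i] = clip_uint8((src[i-2] - 5*src[i-1] + 20*src[i] +
@                        20*src[i+1] - 5*src[i+2] + src[i+3] + 16) >> 5);
@
@ lowpass_const packs the two tap magnitudes into d6 (lane 0 = 5,
@ lane 1 = 20) so the filters can use vmla/vmls by scalar.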
.macro lowpass_const r
        movw \r, #5
        movt \r, #20
        vmov.32 d6[0], \r
.endm

.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
.if \narrow
        t0 .req q0
        t1 .req q8
.else
        t0 .req \d0
        t1 .req \d1
.endif
        vext.8 d2, \r0, \r1, #2
        vext.8 d3, \r0, \r1, #3
        vaddl.u8 q1, d2, d3
        vext.8 d4, \r0, \r1, #1
        vext.8 d5, \r0, \r1, #4
        vaddl.u8 q2, d4, d5
        vext.8 d30, \r0, \r1, #5
        vaddl.u8 t0, \r0, d30
        vext.8 d18, \r2, \r3, #2
        vmla.i16 t0, q1, d6[1]
        vext.8 d19, \r2, \r3, #3
        vaddl.u8 q9, d18, d19
        vext.8 d20, \r2, \r3, #1
        vmls.i16 t0, q2, d6[0]
        vext.8 d21, \r2, \r3, #4
        vaddl.u8 q10, d20, d21
        vext.8 d31, \r2, \r3, #5
        vaddl.u8 t1, \r2, d31
        vmla.i16 t1, q9, d6[1]
        vmls.i16 t1, q10, d6[0]
.if \narrow
        vqrshrun.s16 \d0, t0, #5
        vqrshrun.s16 \d1, t1, #5
.endif
.unreq t0
.unreq t1
.endm
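@ With narrow=0 the final rounding shift is skipped and the raw 16-bit
@ filter output is returned instead, for use as the intermediate of
@ the 2D (hv) filter below.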

.macro lowpass_8_1 r0, r1, d0, narrow=1
.if \narrow
        t0 .req q0
.else
        t0 .req \d0
.endif
        vext.8 d2, \r0, \r1, #2
        vext.8 d3, \r0, \r1, #3
        vaddl.u8 q1, d2, d3
        vext.8 d4, \r0, \r1, #1
        vext.8 d5, \r0, \r1, #4
        vaddl.u8 q2, d4, d5
        vext.8 d30, \r0, \r1, #5
        vaddl.u8 t0, \r0, d30
        vmla.i16 t0, q1, d6[1]
        vmls.i16 t0, q2, d6[0]
.if \narrow
        vqrshrun.s16 \d0, t0, #5
.endif
.unreq t0
.endm

.macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16 q1, \r0, \r1, #2
        vext.16 q0, \r0, \r1, #3
        vaddl.s16 q9, d2, d0
        vext.16 q2, \r0, \r1, #1
        vaddl.s16 q1, d3, d1
        vext.16 q3, \r0, \r1, #4
        vaddl.s16 q10, d4, d6
        vext.16 \r1, \r0, \r1, #5
        vaddl.s16 q2, d5, d7
        vaddl.s16 q0, \h0, \h1
        vaddl.s16 q8, \l0, \l1

        vshl.i32 q3, q9, #4
        vshl.i32 q9, q9, #2
        vshl.i32 q15, q10, #2
        vadd.i32 q9, q9, q3
        vadd.i32 q10, q10, q15

        vshl.i32 q3, q1, #4
        vshl.i32 q1, q1, #2
        vshl.i32 q15, q2, #2
        vadd.i32 q1, q1, q3
        vadd.i32 q2, q2, q15

        vadd.i32 q9, q9, q8
        vsub.i32 q9, q9, q10

        vadd.i32 q1, q1, q0
        vsub.i32 q1, q1, q2

        vrshrn.s32 d18, q9, #10
        vrshrn.s32 d19, q1, #10

        vqmovun.s16 \d, q9
.endm
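@ Vertical pass over the 16-bit intermediates: the same taps applied
@ in 32-bit accumulators, with *20 formed as (x << 4) + (x << 2) and
@ *5 as x + (x << 2), and a single rounding >> 10 standing in for the
@ two per-pass >> 5 shifts.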

function put_h264_qpel16_h_lowpass_neon_packed
        mov r4, lr
        mov ip, #16
        mov r3, #8
        bl put_h264_qpel8_h_lowpass_neon
        sub r1, r1, r2, lsl #4
        add r1, r1, #8
        mov ip, #16
        mov lr, r4
        b put_h264_qpel8_h_lowpass_neon
.endfunc

function put_h264_qpel16_h_lowpass_neon
        push {lr}
        mov ip, #16
        bl put_h264_qpel8_h_lowpass_neon
        sub r0, r0, r3, lsl #4
        sub r1, r1, r2, lsl #4
        add r0, r0, #8
        add r1, r1, #8
        mov ip, #16
        pop {lr}
.endfunc

function put_h264_qpel8_h_lowpass_neon
1:      vld1.64 {d0, d1}, [r1], r2
        vld1.64 {d16,d17}, [r1], r2
        subs ip, ip, #2
        lowpass_8 d0, d1, d16, d17, d0, d16
        vst1.64 {d0}, [r0,:64], r3
        vst1.64 {d16}, [r0,:64], r3
        bne 1b
        bx lr
.endfunc

function put_h264_qpel16_h_lowpass_l2_neon
        push {lr}
        mov ip, #16
        bl put_h264_qpel8_h_lowpass_l2_neon
        sub r0, r0, r2, lsl #4
        sub r1, r1, r2, lsl #4
        sub r3, r3, r2, lsl #4
        add r0, r0, #8
        add r1, r1, #8
        add r3, r3, #8
        mov ip, #16
        pop {lr}
.endfunc

function put_h264_qpel8_h_lowpass_l2_neon
1:      vld1.64 {d0, d1}, [r1], r2
        vld1.64 {d16,d17}, [r1], r2
        vld1.64 {d28}, [r3], r2
        vld1.64 {d29}, [r3], r2
        subs ip, ip, #2
        lowpass_8 d0, d1, d16, d17, d0, d1
        vrhadd.u8 q0, q0, q14
        vst1.64 {d0}, [r0,:64], r2
        vst1.64 {d1}, [r0,:64], r2
        bne 1b
        bx lr
.endfunc

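@ The vertical lowpass loads 8 + 5 rows, transposes them in registers,
@ reuses the horizontal six-tap macro, and transposes back, so no
@ separate column-wise filter is needed.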
function put_h264_qpel16_v_lowpass_neon_packed
        mov r4, lr
        mov r2, #8
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
        b put_h264_qpel8_v_lowpass_neon
.endfunc

function put_h264_qpel16_v_lowpass_neon
        mov r4, lr
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_v_lowpass_neon
        sub r0, r0, r2, lsl #4
        add r0, r0, #8
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl put_h264_qpel8_v_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
.endfunc

function put_h264_qpel8_v_lowpass_neon
        vld1.64 {d8}, [r1], r3
        vld1.64 {d10}, [r1], r3
        vld1.64 {d12}, [r1], r3
        vld1.64 {d14}, [r1], r3
        vld1.64 {d22}, [r1], r3
        vld1.64 {d24}, [r1], r3
        vld1.64 {d26}, [r1], r3
        vld1.64 {d28}, [r1], r3
        vld1.64 {d9}, [r1], r3
        vld1.64 {d11}, [r1], r3
        vld1.64 {d13}, [r1], r3
        vld1.64 {d15}, [r1], r3
        vld1.64 {d23}, [r1]

        transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8 d8, d9, d10, d11, d8, d10
        lowpass_8 d12, d13, d14, d15, d12, d14
        lowpass_8 d22, d23, d24, d25, d22, d24
        lowpass_8 d26, d27, d28, d29, d26, d28
        transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28

        vst1.64 {d8}, [r0,:64], r2
        vst1.64 {d10}, [r0,:64], r2
        vst1.64 {d12}, [r0,:64], r2
        vst1.64 {d14}, [r0,:64], r2
        vst1.64 {d22}, [r0,:64], r2
        vst1.64 {d24}, [r0,:64], r2
        vst1.64 {d26}, [r0,:64], r2
        vst1.64 {d28}, [r0,:64], r2

        bx lr
.endfunc

function put_h264_qpel16_v_lowpass_l2_neon
        mov r4, lr
        bl put_h264_qpel8_v_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_v_lowpass_l2_neon
        sub r0, r0, r3, lsl #4
        sub ip, ip, r2, lsl #4
        add r0, r0, #8
        add ip, ip, #8
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        bl put_h264_qpel8_v_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        mov lr, r4
.endfunc

function put_h264_qpel8_v_lowpass_l2_neon
        vld1.64 {d8}, [r1], r3
        vld1.64 {d10}, [r1], r3
        vld1.64 {d12}, [r1], r3
        vld1.64 {d14}, [r1], r3
        vld1.64 {d22}, [r1], r3
        vld1.64 {d24}, [r1], r3
        vld1.64 {d26}, [r1], r3
        vld1.64 {d28}, [r1], r3
        vld1.64 {d9}, [r1], r3
        vld1.64 {d11}, [r1], r3
        vld1.64 {d13}, [r1], r3
        vld1.64 {d15}, [r1], r3
        vld1.64 {d23}, [r1]

        transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8 d8, d9, d10, d11, d8, d9
        lowpass_8 d12, d13, d14, d15, d12, d13
        lowpass_8 d22, d23, d24, d25, d22, d23
        lowpass_8 d26, d27, d28, d29, d26, d27
        transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27

        vld1.64 {d0}, [ip], r2
        vld1.64 {d1}, [ip], r2
        vld1.64 {d2}, [ip], r2
        vld1.64 {d3}, [ip], r2
        vld1.64 {d4}, [ip], r2
        vrhadd.u8 q0, q0, q4
        vld1.64 {d5}, [ip], r2
        vrhadd.u8 q1, q1, q6
        vld1.64 {d10}, [ip], r2
        vrhadd.u8 q2, q2, q11
        vld1.64 {d11}, [ip], r2

        vst1.64 {d0}, [r0,:64], r3
        vst1.64 {d1}, [r0,:64], r3
        vrhadd.u8 q5, q5, q13
        vst1.64 {d2}, [r0,:64], r3
        vst1.64 {d3}, [r0,:64], r3
        vst1.64 {d4}, [r0,:64], r3
        vst1.64 {d5}, [r0,:64], r3
        vst1.64 {d10}, [r0,:64], r3
        vst1.64 {d11}, [r0,:64], r3

        bx lr
.endfunc

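@ For the centre (hv) positions, the horizontal pass runs with
@ narrow=0 over 8 + 5 rows into a 16-bit scratch buffer at r4; the
@ buffer is then transposed in registers and fed through lowpass_8.16
@ for the vertical pass.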
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const ip
        mov ip, #12
1:      vld1.64 {d0, d1}, [r1], r3
        vld1.64 {d16,d17}, [r1], r3
        subs ip, ip, #2
        lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
        vst1.64 {d22-d25}, [r4,:128]!
        bne 1b

        vld1.64 {d0, d1}, [r1]
        lowpass_8_1 d0, d1, q12, narrow=0

        mov ip, #-16
        add r4, r4, ip
        vld1.64 {d30,d31}, [r4,:128], ip
        vld1.64 {d20,d21}, [r4,:128], ip
        vld1.64 {d18,d19}, [r4,:128], ip
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d14,d15}, [r4,:128], ip
        vld1.64 {d12,d13}, [r4,:128], ip
        vld1.64 {d10,d11}, [r4,:128], ip
        vld1.64 {d8, d9}, [r4,:128], ip
        vld1.64 {d6, d7}, [r4,:128], ip
        vld1.64 {d4, d5}, [r4,:128], ip
        vld1.64 {d2, d3}, [r4,:128], ip
        vld1.64 {d0, d1}, [r4,:128]

        swap4 d1, d3, d5, d7, d8, d10, d12, d14
        transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7

        swap4 d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11

        vst1.64 {d30,d31}, [r4,:128]!
        vst1.64 {d6, d7}, [r4,:128]!
        vst1.64 {d20,d21}, [r4,:128]!
        vst1.64 {d4, d5}, [r4,:128]!
        vst1.64 {d18,d19}, [r4,:128]!
        vst1.64 {d2, d3}, [r4,:128]!
        vst1.64 {d16,d17}, [r4,:128]!
        vst1.64 {d0, d1}, [r4,:128]

        lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
        lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
        lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
        lowpass_8.16 q7, q11, d14, d15, d22, d23, d11

        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128], ip
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
        vld1.64 {d16,d17}, [r4,:128], ip
        vld1.64 {d30,d31}, [r4,:128]
        lowpass_8.16 q8, q15, d16, d17, d30, d31, d15

        transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11

        bx lr
.endfunc

function put_h264_qpel8_hv_lowpass_neon
        mov r10, lr
        bl put_h264_qpel8_hv_lowpass_neon_top
        vst1.64 {d12}, [r0,:64], r2
        vst1.64 {d13}, [r0,:64], r2
        vst1.64 {d14}, [r0,:64], r2
        vst1.64 {d15}, [r0,:64], r2
        vst1.64 {d8}, [r0,:64], r2
        vst1.64 {d9}, [r0,:64], r2
        vst1.64 {d10}, [r0,:64], r2
        vst1.64 {d11}, [r0,:64], r2

        mov lr, r10
        bx lr
.endfunc

function put_h264_qpel8_hv_lowpass_l2_neon
        mov r10, lr
        bl put_h264_qpel8_hv_lowpass_neon_top

        vld1.64 {d0, d1}, [r2,:128]!
        vld1.64 {d2, d3}, [r2,:128]!
        vrhadd.u8 q0, q0, q6
        vld1.64 {d4, d5}, [r2,:128]!
        vrhadd.u8 q1, q1, q7
        vld1.64 {d6, d7}, [r2,:128]!
        vrhadd.u8 q2, q2, q4

        vst1.64 {d0}, [r0,:64], r3
        vrhadd.u8 q3, q3, q5
        vst1.64 {d1}, [r0,:64], r3
        vst1.64 {d2}, [r0,:64], r3
        vst1.64 {d3}, [r0,:64], r3
        vst1.64 {d4}, [r0,:64], r3
        vst1.64 {d5}, [r0,:64], r3
        vst1.64 {d6}, [r0,:64], r3
        vst1.64 {d7}, [r0,:64], r3

        mov lr, r10
        bx lr
.endfunc

function put_h264_qpel16_hv_lowpass_neon
        mov r9, lr
        bl put_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        sub r0, r0, r2, lsl #4
        add r0, r0, #8
        bl put_h264_qpel8_hv_lowpass_neon
        sub r1, r1, r3, lsl #2
        mov lr, r9
        b put_h264_qpel8_hv_lowpass_neon
.endfunc

function put_h264_qpel16_hv_lowpass_l2_neon
        mov r9, lr
        sub r2, r4, #256
        bl put_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        bl put_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #4
        sub r1, r1, r3, lsl #2
        add r1, r1, #8
        sub r0, r0, r3, lsl #4
        add r0, r0, #8
        bl put_h264_qpel8_hv_lowpass_l2_neon
        sub r1, r1, r3, lsl #2
        mov lr, r9
        b put_h264_qpel8_hv_lowpass_l2_neon
.endfunc

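@ Quarter-pel entry points.  mc\xy follows the dsputil naming, x and
@ y being the horizontal and vertical quarter-sample offsets; the
@ quarter positions are formed by round-averaging a half-sample
@ six-tap result with a neighbouring full- or half-sample prediction
@ (the _l2 paths).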
function ff_put_h264_qpel8_mc10_neon, export=1
        lowpass_const r3
        mov r3, r1
        sub r1, r1, #2
        mov ip, #8
        b put_h264_qpel8_h_lowpass_l2_neon
.endfunc

function ff_put_h264_qpel8_mc20_neon, export=1
        lowpass_const r3
        sub r1, r1, #2
        mov r3, r2
        mov ip, #8
        b put_h264_qpel8_h_lowpass_neon
.endfunc

function ff_put_h264_qpel8_mc30_neon, export=1
        lowpass_const r3
        add r3, r1, #1
        sub r1, r1, #2
        mov ip, #8
        b put_h264_qpel8_h_lowpass_l2_neon
.endfunc

function ff_put_h264_qpel8_mc01_neon, export=1
        push {lr}
        mov ip, r1
put_h264_qpel8_mc01:
        lowpass_const r3
        mov r3, r2
        sub r1, r1, r2, lsl #1
        vpush {d8-d15}
        bl put_h264_qpel8_v_lowpass_l2_neon
        vpop {d8-d15}
        pop {pc}
.endfunc

function ff_put_h264_qpel8_mc11_neon, export=1
        push {r0, r1, r11, lr}
put_h264_qpel8_mc11:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #64
        mov r0, sp
        sub r1, r1, #2
        mov r3, #8
        mov ip, #8
        vpush {d8-d15}
        bl put_h264_qpel8_h_lowpass_neon
        ldrd r0, [r11]
        mov r3, r2
        add ip, sp, #64
        sub r1, r1, r2, lsl #1
        mov r2, #8
        bl put_h264_qpel8_v_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r11, pc}
.endfunc

function ff_put_h264_qpel8_mc21_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc21:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(8*8+16*12)
        sub r1, r1, #2
        mov r3, #8
        mov r0, sp
        mov ip, #8
        vpush {d8-d15}
        bl put_h264_qpel8_h_lowpass_neon
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub r2, r4, #64
        bl put_h264_qpel8_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4, r10, r11, pc}
.endfunc

function ff_put_h264_qpel8_mc31_neon, export=1
        add r1, r1, #1
        push {r0, r1, r11, lr}
        sub r1, r1, #1
        b put_h264_qpel8_mc11
.endfunc

function ff_put_h264_qpel8_mc02_neon, export=1
        push {lr}
        lowpass_const r3
        sub r1, r1, r2, lsl #1
        mov r3, r2
        vpush {d8-d15}
        bl put_h264_qpel8_v_lowpass_neon
        vpop {d8-d15}
        pop {pc}
.endfunc

function ff_put_h264_qpel8_mc12_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc12:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(8*8+16*12)
        sub r1, r1, r2, lsl #1
        mov r3, r2
        mov r2, #8
        mov r0, sp
        vpush {d8-d15}
        bl put_h264_qpel8_v_lowpass_neon
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r3, lsl #1
        sub r1, r1, #2
        sub r2, r4, #64
        bl put_h264_qpel8_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4, r10, r11, pc}
.endfunc

function ff_put_h264_qpel8_mc22_neon, export=1
        push {r4, r10, r11, lr}
        mov r11, sp
        bic sp, sp, #15
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub sp, sp, #(16*12)
        mov r4, sp
        vpush {d8-d15}
        bl put_h264_qpel8_hv_lowpass_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r10, r11, pc}
.endfunc

function ff_put_h264_qpel8_mc32_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
        add r1, r1, #1
        b put_h264_qpel8_mc12
.endfunc

function ff_put_h264_qpel8_mc03_neon, export=1
        push {lr}
        add ip, r1, r2
        b put_h264_qpel8_mc01
.endfunc

function ff_put_h264_qpel8_mc13_neon, export=1
        push {r0, r1, r11, lr}
        add r1, r1, r2
        b put_h264_qpel8_mc11
.endfunc

function ff_put_h264_qpel8_mc23_neon, export=1
        push {r0, r1, r4, r10, r11, lr}
        add r1, r1, r2
        b put_h264_qpel8_mc21
.endfunc

function ff_put_h264_qpel8_mc33_neon, export=1
        add r1, r1, #1
        push {r0, r1, r11, lr}
        add r1, r1, r2
        sub r1, r1, #1
        b put_h264_qpel8_mc11
.endfunc

function ff_put_h264_qpel16_mc10_neon, export=1
        lowpass_const r3
        mov r3, r1
        sub r1, r1, #2
        b put_h264_qpel16_h_lowpass_l2_neon
.endfunc

function ff_put_h264_qpel16_mc20_neon, export=1
        lowpass_const r3
        sub r1, r1, #2
        mov r3, r2
        b put_h264_qpel16_h_lowpass_neon
.endfunc

function ff_put_h264_qpel16_mc30_neon, export=1
        lowpass_const r3
        add r3, r1, #1
        sub r1, r1, #2
        b put_h264_qpel16_h_lowpass_l2_neon
.endfunc

function ff_put_h264_qpel16_mc01_neon, export=1
        push {r4, lr}
        mov ip, r1
put_h264_qpel16_mc01:
        lowpass_const r3
        mov r3, r2
        sub r1, r1, r2, lsl #1
        vpush {d8-d15}
        bl put_h264_qpel16_v_lowpass_l2_neon
        vpop {d8-d15}
        pop {r4, pc}
.endfunc

function ff_put_h264_qpel16_mc11_neon, export=1
        push {r0, r1, r4, r11, lr}
put_h264_qpel16_mc11:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #256
        mov r0, sp
        sub r1, r1, #2
        mov r3, #16
        vpush {d8-d15}
        bl put_h264_qpel16_h_lowpass_neon
        ldrd r0, [r11]
        mov r3, r2
        add ip, sp, #64
        sub r1, r1, r2, lsl #1
        mov r2, #16
        bl put_h264_qpel16_v_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4, r11, pc}
.endfunc

function ff_put_h264_qpel16_mc21_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc21:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(16*16+16*12)
        sub r1, r1, #2
        mov r0, sp
        vpush {d8-d15}
        bl put_h264_qpel16_h_lowpass_neon_packed
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        bl put_h264_qpel16_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4-r5, r9-r11, pc}
.endfunc

function ff_put_h264_qpel16_mc31_neon, export=1
        add r1, r1, #1
        push {r0, r1, r4, r11, lr}
        sub r1, r1, #1
        b put_h264_qpel16_mc11
.endfunc

function ff_put_h264_qpel16_mc02_neon, export=1
        push {r4, lr}
        lowpass_const r3
        sub r1, r1, r2, lsl #1
        mov r3, r2
        vpush {d8-d15}
        bl put_h264_qpel16_v_lowpass_neon
        vpop {d8-d15}
        pop {r4, pc}
.endfunc

function ff_put_h264_qpel16_mc12_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc12:
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub sp, sp, #(16*16+16*12)
        sub r1, r1, r2, lsl #1
        mov r0, sp
        mov r3, r2
        vpush {d8-d15}
        bl put_h264_qpel16_v_lowpass_neon_packed
        mov r4, r0
        ldrd r0, [r11]
        sub r1, r1, r3, lsl #1
        sub r1, r1, #2
        mov r2, r3
        bl put_h264_qpel16_hv_lowpass_l2_neon
        vpop {d8-d15}
        add sp, r11, #8
        pop {r4-r5, r9-r11, pc}
.endfunc

function ff_put_h264_qpel16_mc22_neon, export=1
        push {r4, r9-r11, lr}
        lowpass_const r3
        mov r11, sp
        bic sp, sp, #15
        sub r1, r1, r2, lsl #1
        sub r1, r1, #2
        mov r3, r2
        sub sp, sp, #(16*12)
        mov r4, sp
        vpush {d8-d15}
        bl put_h264_qpel16_hv_lowpass_neon
        vpop {d8-d15}
        mov sp, r11
        pop {r4, r9-r11, pc}
.endfunc

function ff_put_h264_qpel16_mc32_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
        add r1, r1, #1
        b put_h264_qpel16_mc12
.endfunc

function ff_put_h264_qpel16_mc03_neon, export=1
        push {r4, lr}
        add ip, r1, r2
        b put_h264_qpel16_mc01
.endfunc

function ff_put_h264_qpel16_mc13_neon, export=1
        push {r0, r1, r4, r11, lr}
        add r1, r1, r2
        b put_h264_qpel16_mc11
.endfunc

function ff_put_h264_qpel16_mc23_neon, export=1
        push {r0, r1, r4-r5, r9-r11, lr}
        add r1, r1, r2
        b put_h264_qpel16_mc21
.endfunc

function ff_put_h264_qpel16_mc33_neon, export=1
        add r1, r1, #1
        push {r0, r1, r4, r11, lr}
        add r1, r1, r2
        sub r1, r1, #1
        b put_h264_qpel16_mc11
.endfunc

@ Biweighted prediction

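@ A sketch in C of the biweighted prediction applied below (logWD is
@ the log2 weight denominator; names are illustrative):
@
@   dst[x] = clip_uint8(((src0[x] * w0 + src1[x] * w1 +
@                         (1 << logWD)) >> (logWD + 1)) + offset);
@
@ The rounding constant and offset are folded into the accumulator
@ seed q8 = ((offset + 1) | 1) << logWD, and q9 holds ~logWD, i.e.
@ -(logWD + 1), as a negative vshl count for the final shift.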
.macro biweight_16 macs, macd
        vdup.8 d0, r4
        vdup.8 d1, r5
        vmov q2, q8
        vmov q3, q8
1:      subs ip, ip, #2
        vld1.8 {d20-d21},[r0,:128], r2
        \macd q2, d0, d20
        pld [r0]
        \macd q3, d0, d21
        vld1.8 {d22-d23},[r1,:128], r2
        \macs q2, d1, d22
        pld [r1]
        \macs q3, d1, d23
        vmov q12, q8
        vld1.8 {d28-d29},[r0,:128], r2
        vmov q13, q8
        \macd q12, d0, d28
        pld [r0]
        \macd q13, d0, d29
        vld1.8 {d30-d31},[r1,:128], r2
        \macs q12, d1, d30
        pld [r1]
        \macs q13, d1, d31
        vshl.s16 q2, q2, q9
        vshl.s16 q3, q3, q9
        vqmovun.s16 d4, q2
        vqmovun.s16 d5, q3
        vshl.s16 q12, q12, q9
        vshl.s16 q13, q13, q9
        vqmovun.s16 d24, q12
        vqmovun.s16 d25, q13
        vmov q3, q8
        vst1.8 {d4- d5}, [r6,:128], r2
        vmov q2, q8
        vst1.8 {d24-d25},[r6,:128], r2
        bne 1b
        pop {r4-r6, pc}
.endm

.macro biweight_8 macs, macd
        vdup.8 d0, r4
        vdup.8 d1, r5
        vmov q1, q8
        vmov q10, q8
1:      subs ip, ip, #2
        vld1.8 {d4},[r0,:64], r2
        \macd q1, d0, d4
        pld [r0]
        vld1.8 {d5},[r1,:64], r2
        \macs q1, d1, d5
        pld [r1]
        vld1.8 {d6},[r0,:64], r2
        \macd q10, d0, d6
        pld [r0]
        vld1.8 {d7},[r1,:64], r2
        \macs q10, d1, d7
        pld [r1]
        vshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vmov q10, q8
        vst1.8 {d2},[r6,:64], r2
        vmov q1, q8
        vst1.8 {d4},[r6,:64], r2
        bne 1b
        pop {r4-r6, pc}
.endm

.macro biweight_4 macs, macd
        vdup.8 d0, r4
        vdup.8 d1, r5
        vmov q1, q8
        vmov q10, q8
1:      subs ip, ip, #4
        vld1.32 {d4[0]},[r0,:32], r2
        vld1.32 {d4[1]},[r0,:32], r2
        \macd q1, d0, d4
        pld [r0]
        vld1.32 {d5[0]},[r1,:32], r2
        vld1.32 {d5[1]},[r1,:32], r2
        \macs q1, d1, d5
        pld [r1]
        blt 2f
        vld1.32 {d6[0]},[r0,:32], r2
        vld1.32 {d6[1]},[r0,:32], r2
        \macd q10, d0, d6
        pld [r0]
        vld1.32 {d7[0]},[r1,:32], r2
        vld1.32 {d7[1]},[r1,:32], r2
        \macs q10, d1, d7
        pld [r1]
        vshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vmov q10, q8
        vst1.32 {d2[0]},[r6,:32], r2
        vst1.32 {d2[1]},[r6,:32], r2
        vmov q1, q8
        vst1.32 {d4[0]},[r6,:32], r2
        vst1.32 {d4[1]},[r6,:32], r2
        bne 1b
        pop {r4-r6, pc}
2:      vshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vst1.32 {d2[0]},[r6,:32], r2
        vst1.32 {d2[1]},[r6,:32], r2
        pop {r4-r6, pc}
.endm

.macro biweight_func w
function biweight_h264_pixels_\w\()_neon
        push {r4-r6, lr}
        add r4, sp, #16
        ldm r4, {r4-r6}
        lsr lr, r4, #31
        add r6, r6, #1
        eors lr, lr, r5, lsr #30
        orr r6, r6, #1
        vdup.16 q9, r3
        lsl r6, r6, r3
        vmvn q9, q9
        vdup.16 q8, r6
        mov r6, r0
        beq 10f
        subs lr, lr, #1
        beq 20f
        subs lr, lr, #1
        beq 30f
        b 40f
10:     biweight_\w vmlal.u8, vmlal.u8
20:     rsb r4, r4, #0
        biweight_\w vmlal.u8, vmlsl.u8
30:     rsb r4, r4, #0
        rsb r5, r5, #0
        biweight_\w vmlsl.u8, vmlsl.u8
40:     rsb r5, r5, #0
        biweight_\w vmlsl.u8, vmlal.u8
.endfunc
.endm
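@ The prologue above classifies the sign combination of the two
@ weights and jumps to the variant built from the matching
@ vmlal/vmlsl pair, negating weights as needed so the u8 multiplies
@ always see non-negative factors.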

.macro biweight_entry w, h, b=1
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
        mov ip, #\h
.if \b
        b biweight_h264_pixels_\w\()_neon
.endif
.endfunc
.endm
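@ Entries with b=1 branch to the shared per-width body; each b=0 entry
@ is placed directly before its biweight_func instantiation and simply
@ falls through into it.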

biweight_entry 16, 8
biweight_entry 16, 16, b=0
biweight_func 16

biweight_entry 8, 16
biweight_entry 8, 4
biweight_entry 8, 8, b=0
biweight_func 8

biweight_entry 4, 8
biweight_entry 4, 2
biweight_entry 4, 4, b=0
biweight_func 4

@ Weighted prediction

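@ A sketch in C of the (unidirectional) weighted prediction applied
@ below (logWD is the log2 weight denominator):
@
@   dst[x] = clip_uint8(((src[x] * w + (1 << (logWD - 1))) >> logWD) + offset);
@
@ q8 is preloaded with offset << logWD so the offset is folded in
@ before the rounding shift; for logWD > 1 a halving vhadd performs
@ the first shift step, and negative weights use the subtracting
@ (vhsub/vsub) variants with w negated.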
.macro weight_16 add
        vdup.8 d0, r3
1:      subs ip, ip, #2
        vld1.8 {d20-d21},[r0,:128], r1
        vmull.u8 q2, d0, d20
        pld [r0]
        vmull.u8 q3, d0, d21
        vld1.8 {d28-d29},[r0,:128], r1
        vmull.u8 q12, d0, d28
        pld [r0]
        vmull.u8 q13, d0, d29
        \add q2, q8, q2
        vrshl.s16 q2, q2, q9
        \add q3, q8, q3
        vrshl.s16 q3, q3, q9
        vqmovun.s16 d4, q2
        vqmovun.s16 d5, q3
        \add q12, q8, q12
        vrshl.s16 q12, q12, q9
        \add q13, q8, q13
        vrshl.s16 q13, q13, q9
        vqmovun.s16 d24, q12
        vqmovun.s16 d25, q13
        vst1.8 {d4- d5}, [r4,:128], r1
        vst1.8 {d24-d25},[r4,:128], r1
        bne 1b
        pop {r4, pc}
.endm

.macro weight_8 add
        vdup.8 d0, r3
1:      subs ip, ip, #2
        vld1.8 {d4},[r0,:64], r1
        vmull.u8 q1, d0, d4
        pld [r0]
        vld1.8 {d6},[r0,:64], r1
        vmull.u8 q10, d0, d6
        \add q1, q8, q1
        pld [r0]
        vrshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        \add q10, q8, q10
        vrshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vst1.8 {d2},[r4,:64], r1
        vst1.8 {d4},[r4,:64], r1
        bne 1b
        pop {r4, pc}
.endm

.macro weight_4 add
        vdup.8 d0, r3
        vmov q1, q8
        vmov q10, q8
1:      subs ip, ip, #4
        vld1.32 {d4[0]},[r0,:32], r1
        vld1.32 {d4[1]},[r0,:32], r1
        vmull.u8 q1, d0, d4
        pld [r0]
        blt 2f
        vld1.32 {d6[0]},[r0,:32], r1
        vld1.32 {d6[1]},[r0,:32], r1
        vmull.u8 q10, d0, d6
        pld [r0]
        \add q1, q8, q1
        vrshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        \add q10, q8, q10
        vrshl.s16 q10, q10, q9
        vqmovun.s16 d4, q10
        vmov q10, q8
        vst1.32 {d2[0]},[r4,:32], r1
        vst1.32 {d2[1]},[r4,:32], r1
        vmov q1, q8
        vst1.32 {d4[0]},[r4,:32], r1
        vst1.32 {d4[1]},[r4,:32], r1
        bne 1b
        pop {r4, pc}
2:      \add q1, q8, q1
        vrshl.s16 q1, q1, q9
        vqmovun.s16 d2, q1
        vst1.32 {d2[0]},[r4,:32], r1
        vst1.32 {d2[1]},[r4,:32], r1
        pop {r4, pc}
.endm

.macro weight_func w
function weight_h264_pixels_\w\()_neon
        push {r4, lr}
        ldr r4, [sp, #8]
        cmp r2, #1
        lsl r4, r4, r2
        vdup.16 q8, r4
        mov r4, r0
        ble 20f
        rsb lr, r2, #1
        vdup.16 q9, lr
        cmp r3, #0
        blt 10f
        weight_\w vhadd.s16
10:     rsb r3, r3, #0
        weight_\w vhsub.s16
20:     rsb lr, r2, #0
        vdup.16 q9, lr
        cmp r3, #0
        blt 10f
        weight_\w vadd.s16
10:     rsb r3, r3, #0
        weight_\w vsub.s16
.endfunc
.endm

.macro weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov ip, #\h
.if \b
        b weight_h264_pixels_\w\()_neon
.endif
.endfunc
.endm

weight_entry 16, 8
weight_entry 16, 16, b=0
weight_func 16

weight_entry 8, 16
weight_entry 8, 4
weight_entry 8, 8, b=0
weight_func 8

weight_entry 4, 8
weight_entry 4, 2
weight_entry 4, 4, b=0
weight_func 4