ARM: NEON 16x16 and 8x8 avg qpel MC
[libav.git] / libavcodec / arm / h264dsp_neon.S
CommitLineData
1cce897a
MR
1/*
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "asm.S"
22
5813e05d
MR
@ transpose_8x8: transpose an 8x8 byte matrix held one row per register
@ in \r0..\r7, using a three-stage VTRN exchange network: 32-bit, then
@ 16-bit, then 8-bit element swaps.  Works on d- or q-registers
@ (call sites below use both).
23 .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
24 vtrn.32 \r0, \r4
25 vtrn.32 \r1, \r5
26 vtrn.32 \r2, \r6
27 vtrn.32 \r3, \r7
28 vtrn.16 \r0, \r2
29 vtrn.16 \r1, \r3
30 vtrn.16 \r4, \r6
31 vtrn.16 \r5, \r7
32 vtrn.8 \r0, \r1
33 vtrn.8 \r2, \r3
34 vtrn.8 \r4, \r5
35 vtrn.8 \r6, \r7
36 .endm
37
2da4e5e3
MR
@ transpose_4x4: transpose 4x4 blocks of bytes across \r0..\r3 with a
@ two-stage VTRN network (16-bit then 8-bit element swaps); one row of
@ each 4x4 block per register.
38 .macro transpose_4x4 r0 r1 r2 r3
39 vtrn.16 \r0, \r2
40 vtrn.16 \r1, \r3
41 vtrn.8 \r0, \r1
42 vtrn.8 \r2, \r3
43 .endm
44
5813e05d
MR
@ swap4: exchange four register pairs (\r0<->\r4 .. \r3<->\r7).
@ Used as the cross-half step of a 16-element transpose below.
45 .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
46 vswp \r0, \r4
47 vswp \r1, \r5
48 vswp \r2, \r6
49 vswp \r3, \r7
50 .endm
51
@ transpose16_4x4: transpose 4x4 blocks of 16-bit elements across the
@ eight registers \r0..\r7 (32-bit then 16-bit VTRN stages).  Combined
@ with swap4 above this forms a full transpose of 16-bit data.
52 .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
53 vtrn.32 \r0, \r2
54 vtrn.32 \r1, \r3
55 vtrn.32 \r4, \r6
56 vtrn.32 \r5, \r7
57 vtrn.16 \r0, \r1
58 vtrn.16 \r2, \r3
59 vtrn.16 \r4, \r5
60 vtrn.16 \r6, \r7
61 .endm
62
1cce897a 63/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
77c45373
MR
@ h264_chroma_mc8: 8x-wide H.264 chroma MC, bilinear interpolation.
@   void ff_<type>_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src,
@                                       int stride, int h, int x, int y)
@ \type is "put" (store) or "avg" (round-average with existing dst).
@ r4/r5 = x/y fractional offsets (0..7) loaded from the stack.
@ Bilinear weights: A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy; the three
@ branches specialize for D!=0 (2-D filter), one of x/y zero (1-D
@ filter), and x==y==0 (copy weights collapse to A=64, B=0).
64 .macro h264_chroma_mc8 type
65function ff_\type\()_h264_chroma_mc8_neon, export=1
1cce897a
MR
66 push {r4-r7, lr}
67 ldrd r4, [sp, #20]
77c45373 68.ifc \type,avg
1cce897a
MR
@ lr tracks the dst rows still to be averaged in the avg variant.
69 mov lr, r0
70.endif
71 pld [r1]
72 pld [r1, r2]
73
@ r7 = D = x*y (muls sets Z for the beq below), r6 = C = (8-x)*y,
@ ip = B = x*(8-y), r4 = A = 64 - 8x - 8y + xy = (8-x)*(8-y).
74 muls r7, r4, r5
75 rsb r6, r7, r5, lsl #3
76 rsb ip, r7, r4, lsl #3
77 sub r4, r7, r4, lsl #3
78 sub r4, r4, r5, lsl #3
79 add r4, r4, #64
80
@ x*y == 0: fall through to the 1-D / copy cases.
81 beq 2f
82
@ Full 2-D case: process two rows per iteration (r1 = even rows,
@ r5 = odd rows, both stepped by 2*stride in r4).
83 add r5, r1, r2
84
85 vdup.8 d0, r4
86 lsl r4, r2, #1
87 vdup.8 d1, ip
88 vld1.64 {d4, d5}, [r1], r4
89 vdup.8 d2, r6
90 vld1.64 {d6, d7}, [r5], r4
91 vdup.8 d3, r7
92
@ d5/d7 = the same rows shifted left one pixel (x+1 taps).
93 vext.8 d5, d4, d5, #1
94 vext.8 d7, d6, d7, #1
95
961: pld [r5]
97 vmull.u8 q8, d4, d0
98 vmlal.u8 q8, d5, d1
99 vld1.64 {d4, d5}, [r1], r4
100 vmlal.u8 q8, d6, d2
101 vext.8 d5, d4, d5, #1
102 vmlal.u8 q8, d7, d3
103 vmull.u8 q9, d6, d0
104 subs r3, r3, #2
105 vmlal.u8 q9, d7, d1
106 vmlal.u8 q9, d4, d2
107 vmlal.u8 q9, d5, d3
@ Rounding narrow: (sum + 32) >> 6 back to 8 bits.
108 vrshrn.u16 d16, q8, #6
109 vld1.64 {d6, d7}, [r5], r4
110 pld [r1]
111 vrshrn.u16 d17, q9, #6
77c45373 112.ifc \type,avg
1cce897a
MR
113 vld1.64 {d20}, [lr,:64], r2
114 vld1.64 {d21}, [lr,:64], r2
115 vrhadd.u8 q8, q8, q10
116.endif
117 vext.8 d7, d6, d7, #1
118 vst1.64 {d16}, [r0,:64], r2
119 vst1.64 {d17}, [r0,:64], r2
120 bgt 1b
121
122 pop {r4-r7, pc}
123
@ 1-D case: exactly one of x, y is zero.  ip = B + C is the single
@ non-constant weight; r6 (= C) distinguishes vertical from horizontal.
1242: tst r6, r6
125 add ip, ip, r6
126 vdup.8 d0, r4
127 vdup.8 d1, ip
128
129 beq 4f
130
@ Vertical-only filter (x == 0): blend each row with the next one.
131 add r5, r1, r2
132 lsl r4, r2, #1
133 vld1.64 {d4}, [r1], r4
134 vld1.64 {d6}, [r5], r4
135
1363: pld [r5]
137 vmull.u8 q8, d4, d0
138 vmlal.u8 q8, d6, d1
139 vld1.64 {d4}, [r1], r4
140 vmull.u8 q9, d6, d0
141 vmlal.u8 q9, d4, d1
142 vld1.64 {d6}, [r5], r4
143 vrshrn.u16 d16, q8, #6
144 vrshrn.u16 d17, q9, #6
77c45373 145.ifc \type,avg
1cce897a
MR
146 vld1.64 {d20}, [lr,:64], r2
147 vld1.64 {d21}, [lr,:64], r2
148 vrhadd.u8 q8, q8, q10
149.endif
150 subs r3, r3, #2
151 pld [r1]
152 vst1.64 {d16}, [r0,:64], r2
153 vst1.64 {d17}, [r0,:64], r2
154 bgt 3b
155
156 pop {r4-r7, pc}
157
@ Horizontal-only filter (y == 0), also reached for x==y==0 where
@ d1 becomes 0 and the loop degenerates to a rounded copy.
1584: vld1.64 {d4, d5}, [r1], r2
159 vld1.64 {d6, d7}, [r1], r2
160 vext.8 d5, d4, d5, #1
161 vext.8 d7, d6, d7, #1
162
1635: pld [r1]
164 subs r3, r3, #2
165 vmull.u8 q8, d4, d0
166 vmlal.u8 q8, d5, d1
167 vld1.64 {d4, d5}, [r1], r2
168 vmull.u8 q9, d6, d0
169 vmlal.u8 q9, d7, d1
170 pld [r1]
171 vext.8 d5, d4, d5, #1
172 vrshrn.u16 d16, q8, #6
173 vrshrn.u16 d17, q9, #6
77c45373 174.ifc \type,avg
1cce897a
MR
175 vld1.64 {d20}, [lr,:64], r2
176 vld1.64 {d21}, [lr,:64], r2
177 vrhadd.u8 q8, q8, q10
178.endif
179 vld1.64 {d6, d7}, [r1], r2
180 vext.8 d7, d6, d7, #1
181 vst1.64 {d16}, [r0,:64], r2
182 vst1.64 {d17}, [r0,:64], r2
183 bgt 5b
184
185 pop {r4-r7, pc}
77c45373 186 .endfunc
1cce897a
MR
187 .endm
188
189/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
77c45373
MR
@ h264_chroma_mc4: 4x-wide H.264 chroma MC, bilinear interpolation.
@ Same weight derivation and case split as h264_chroma_mc8 above, but
@ two 4-pixel rows are packed into one d-register (via vtrn.32) so a
@ single vmull/vmlal still does 8 useful multiplies; the two halves of
@ each q accumulator are then folded with vadd.i16.
190 .macro h264_chroma_mc4 type
191function ff_\type\()_h264_chroma_mc4_neon, export=1
1cce897a
MR
192 push {r4-r7, lr}
193 ldrd r4, [sp, #20]
77c45373 194.ifc \type,avg
1cce897a
MR
195 mov lr, r0
196.endif
197 pld [r1]
198 pld [r1, r2]
199
@ r7=D=x*y (sets Z), r6=C=(8-x)y, ip=B=x(8-y), r4=A=(8-x)(8-y).
200 muls r7, r4, r5
201 rsb r6, r7, r5, lsl #3
202 rsb ip, r7, r4, lsl #3
203 sub r4, r7, r4, lsl #3
204 sub r4, r4, r5, lsl #3
205 add r4, r4, #64
206
207 beq 2f
208
209 add r5, r1, r2
210
211 vdup.8 d0, r4
212 lsl r4, r2, #1
213 vdup.8 d1, ip
214 vld1.64 {d4}, [r1], r4
215 vdup.8 d2, r6
216 vld1.64 {d6}, [r5], r4
217 vdup.8 d3, r7
218
@ Pack pixel pair (row, row+1px) into one register half each:
@ d4 = {row[0..3], row[1..4]}, likewise d6 for the odd rows.
219 vext.8 d5, d4, d5, #1
220 vext.8 d7, d6, d7, #1
221 vtrn.32 d4, d5
222 vtrn.32 d6, d7
223
@ Pair the weights the same way: d0 = {A.., B..}, d2 = {C.., D..}.
224 vtrn.32 d0, d1
225 vtrn.32 d2, d3
226
2271: pld [r5]
228 vmull.u8 q8, d4, d0
229 vmlal.u8 q8, d6, d2
230 vld1.64 {d4}, [r1], r4
231 vext.8 d5, d4, d5, #1
232 vtrn.32 d4, d5
233 vmull.u8 q9, d6, d0
234 vmlal.u8 q9, d4, d2
235 vld1.64 {d6}, [r5], r4
@ Fold the A/B and C/D partial sums, then round-narrow >>6.
236 vadd.i16 d16, d16, d17
237 vadd.i16 d17, d18, d19
238 vrshrn.u16 d16, q8, #6
239 subs r3, r3, #2
240 pld [r1]
77c45373 241.ifc \type,avg
1cce897a
MR
242 vld1.32 {d20[0]}, [lr,:32], r2
243 vld1.32 {d20[1]}, [lr,:32], r2
244 vrhadd.u8 d16, d16, d20
245.endif
246 vext.8 d7, d6, d7, #1
247 vtrn.32 d6, d7
248 vst1.32 {d16[0]}, [r0,:32], r2
249 vst1.32 {d16[1]}, [r0,:32], r2
250 bgt 1b
251
252 pop {r4-r7, pc}
253
@ 1-D case (x*y == 0); ip = B + C is the single blending weight.
2542: tst r6, r6
255 add ip, ip, r6
256 vdup.8 d0, r4
257 vdup.8 d1, ip
258 vtrn.32 d0, d1
259
260 beq 4f
261
@ Vertical-only filter: two 4-pixel rows per register.
262 vext.32 d1, d0, d1, #1
263 add r5, r1, r2
264 lsl r4, r2, #1
265 vld1.32 {d4[0]}, [r1], r4
266 vld1.32 {d4[1]}, [r5], r4
267
2683: pld [r5]
269 vmull.u8 q8, d4, d0
270 vld1.32 {d4[0]}, [r1], r4
271 vmull.u8 q9, d4, d1
272 vld1.32 {d4[1]}, [r5], r4
273 vadd.i16 d16, d16, d17
274 vadd.i16 d17, d18, d19
275 vrshrn.u16 d16, q8, #6
77c45373 276.ifc \type,avg
1cce897a
MR
277 vld1.32 {d20[0]}, [lr,:32], r2
278 vld1.32 {d20[1]}, [lr,:32], r2
279 vrhadd.u8 d16, d16, d20
280.endif
281 subs r3, r3, #2
282 pld [r1]
283 vst1.32 {d16[0]}, [r0,:32], r2
284 vst1.32 {d16[1]}, [r0,:32], r2
285 bgt 3b
286
287 pop {r4-r7, pc}
288
@ Horizontal-only filter (y == 0), degenerates to a copy when x==0.
2894: vld1.64 {d4}, [r1], r2
290 vld1.64 {d6}, [r1], r2
291 vext.8 d5, d4, d5, #1
292 vext.8 d7, d6, d7, #1
293 vtrn.32 d4, d5
294 vtrn.32 d6, d7
295
2965: vmull.u8 q8, d4, d0
297 vmull.u8 q9, d6, d0
298 subs r3, r3, #2
299 vld1.64 {d4}, [r1], r2
300 vext.8 d5, d4, d5, #1
301 vtrn.32 d4, d5
302 vadd.i16 d16, d16, d17
303 vadd.i16 d17, d18, d19
304 pld [r1]
305 vrshrn.u16 d16, q8, #6
77c45373 306.ifc \type,avg
1cce897a
MR
307 vld1.32 {d20[0]}, [lr,:32], r2
308 vld1.32 {d20[1]}, [lr,:32], r2
309 vrhadd.u8 d16, d16, d20
310.endif
311 vld1.64 {d6}, [r1], r2
312 vext.8 d7, d6, d7, #1
313 vtrn.32 d6, d7
314 pld [r1]
315 vst1.32 {d16[0]}, [r0,:32], r2
316 vst1.32 {d16[1]}, [r0,:32], r2
317 bgt 5b
318
319 pop {r4-r7, pc}
77c45373 320 .endfunc
1cce897a
MR
321 .endm
322
@ Instantiate the exported put/avg variants of the chroma MC macros.
323 .text
324 .align
325
77c45373
MR
326 h264_chroma_mc8 put
327 h264_chroma_mc8 avg
328 h264_chroma_mc4 put
329 h264_chroma_mc4 avg
ad74a0f8
MR
330
331 /* H.264 loop filter */
332
@ h264_loop_filter_start: common entry for the deblocking functions.
@ Loads the 4 packed tc0 bytes (pointer passed on the stack) into
@ d24[0] and ip, and returns early from the caller when nothing is to
@ be filtered: bxeq when alpha (r2) or beta (r3) is zero, bxlt when the
@ shifted-AND trick leaves the sign bit set, i.e. (presumably) when all
@ four tc0 values are negative — TODO confirm against the C caller.
333 .macro h264_loop_filter_start
334 ldr ip, [sp]
335 tst r2, r2
336 ldr ip, [ip]
337 tstne r3, r3
338 vmov.32 d24[0], ip
339 and ip, ip, ip, lsl #16
340 bxeq lr
341 ands ip, ip, ip, lsl #8
342 bxlt lr
343 .endm
344
@ align_push_regs: spill callee-saved NEON regs d8-d15 to a 16-byte
@ aligned stack area.  ip keeps the alignment-dependent extra offset
@ so align_pop_regs can undo it; ip must stay live in between.
345 .macro align_push_regs
346 and ip, sp, #15
347 add ip, ip, #32
348 sub sp, sp, ip
349 vst1.64 {d12-d15}, [sp,:128]
350 sub sp, sp, #32
351 vst1.64 {d8-d11}, [sp,:128]
352 .endm
353
@ align_pop_regs: restore d8-d15 saved by align_push_regs and release
@ the aligned stack area (final post-increment by ip).
354 .macro align_pop_regs
355 vld1.64 {d8-d11}, [sp,:128]!
356 vld1.64 {d12-d15}, [sp,:128], ip
357 .endm
358
@ h264_loop_filter_luma: normal (bS < 4) luma deblocking of one 16-pixel
@ edge.  Register contract at entry: q8 = p0, q9 = p1, q10 = p2,
@ q0 = q0, q1 = q1, q2 = q2, r2 = alpha, r3 = beta, d24 = packed tc0.
@ Outputs: q4/q5 = new p1/q1 (under their masks), q8/q0 = new p0/q0.
359 .macro h264_loop_filter_luma
360 vdup.8 q11, r2 @ alpha
@ Widen + vsli replicate each tc0 byte across its 4-pixel group,
@ giving a per-pixel tc0 in q12.
361 vmovl.u8 q12, d24
362 vabd.u8 q6, q8, q0 @ abs(p0 - q0)
363 vmovl.u16 q12, d24
364 vabd.u8 q14, q9, q8 @ abs(p1 - p0)
365 vsli.16 q12, q12, #8
366 vabd.u8 q15, q1, q0 @ abs(q1 - q0)
367 vsli.32 q12, q12, #16
368 vclt.u8 q6, q6, q11 @ < alpha
369 vdup.8 q11, r3 @ beta
@ q7 = lanes whose tc0 byte is negative (filtering disabled there).
370 vclt.s8 q7, q12, #0
371 vclt.u8 q14, q14, q11 @ < beta
372 vclt.u8 q15, q15, q11 @ < beta
373 vbic q6, q6, q7
374 vabd.u8 q4, q10, q8 @ abs(p2 - p0)
375 vand q6, q6, q14
376 vabd.u8 q5, q2, q0 @ abs(q2 - q0)
377 vclt.u8 q4, q4, q11 @ < beta
378 vand q6, q6, q15
379 vclt.u8 q5, q5, q11 @ < beta
@ q6 = main filter mask; q4/q5 additionally gate the p1/q1 updates
@ (and each extends tc: q6 below becomes tc = tc0 + ap + aq).
380 vand q4, q4, q6
381 vand q5, q5, q6
382 vand q12, q12, q6
@ Candidate p1'/q1' = clip(avg-based value, p1±tc0 / q1±tc0).
383 vrhadd.u8 q14, q8, q0
384 vsub.i8 q6, q12, q4
385 vqadd.u8 q7, q9, q12
386 vhadd.u8 q10, q10, q14
387 vsub.i8 q6, q6, q5
388 vhadd.u8 q14, q2, q14
389 vmin.u8 q7, q7, q10
390 vqsub.u8 q11, q9, q12
391 vqadd.u8 q2, q1, q12
392 vmax.u8 q7, q7, q11
393 vqsub.u8 q11, q1, q12
394 vmin.u8 q14, q2, q14
@ delta = clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc),
@ computed in 16-bit halves (q2 low, q10 high).
395 vmovl.u8 q2, d0
396 vmax.u8 q14, q14, q11
397 vmovl.u8 q10, d1
398 vsubw.u8 q2, q2, d16
399 vsubw.u8 q10, q10, d17
400 vshl.i16 q2, q2, #2
401 vshl.i16 q10, q10, #2
402 vaddw.u8 q2, q2, d18
403 vaddw.u8 q10, q10, d19
404 vsubw.u8 q2, q2, d2
405 vsubw.u8 q10, q10, d3
406 vrshrn.i16 d4, q2, #3
407 vrshrn.i16 d5, q10, #3
@ Select filtered vs original p1/q1 per lane.
408 vbsl q4, q7, q9
409 vbsl q5, q14, q1
410 vneg.s8 q7, q6
411 vmovl.u8 q14, d16
412 vmin.s8 q2, q2, q6
413 vmovl.u8 q6, d17
414 vmax.s8 q2, q2, q7
@ p0' = sat(p0 + delta), q0' = sat(q0 - delta).
415 vmovl.u8 q11, d0
416 vmovl.u8 q12, d1
417 vaddw.s8 q14, q14, d4
418 vaddw.s8 q6, q6, d5
419 vsubw.s8 q11, q11, d4
420 vsubw.s8 q12, q12, d5
421 vqmovun.s16 d16, q14
422 vqmovun.s16 d17, q6
423 vqmovun.s16 d0, q11
424 vqmovun.s16 d1, q12
425 .endm
426
@ ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
@                                 int beta, int8_t *tc0)
@ Filters a horizontal luma edge (rows are contiguous, so whole rows
@ are loaded directly): loads q2..p2, runs the filter, writes back
@ p1, p0, q0, q1.
427function ff_h264_v_loop_filter_luma_neon, export=1
428 h264_loop_filter_start
429
@ r0 enters pointing at q0's row; rewind to p2 after loading q0..q2.
430 vld1.64 {d0, d1}, [r0,:128], r1
431 vld1.64 {d2, d3}, [r0,:128], r1
432 vld1.64 {d4, d5}, [r0,:128], r1
433 sub r0, r0, r1, lsl #2
434 sub r0, r0, r1, lsl #1
435 vld1.64 {d20,d21}, [r0,:128], r1
436 vld1.64 {d18,d19}, [r0,:128], r1
437 vld1.64 {d16,d17}, [r0,:128], r1
438
439 align_push_regs
440
441 h264_loop_filter_luma
442
443 sub r0, r0, r1, lsl #1
444 vst1.64 {d8, d9}, [r0,:128], r1
445 vst1.64 {d16,d17}, [r0,:128], r1
446 vst1.64 {d0, d1}, [r0,:128], r1
447 vst1.64 {d10,d11}, [r0,:128]
448
449 align_pop_regs
450 bx lr
451 .endfunc
452
@ ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
@                                 int beta, int8_t *tc0)
@ Filters a vertical luma edge: loads 16 rows of 8 pixels straddling
@ the edge, transposes into the register layout h264_loop_filter_luma
@ expects, filters, transposes the 4 changed columns back and stores
@ them as 32-bit lane writes.
453function ff_h264_h_loop_filter_luma_neon, export=1
454 h264_loop_filter_start
455
456 sub r0, r0, #4
457 vld1.64 {d6}, [r0], r1
458 vld1.64 {d20}, [r0], r1
459 vld1.64 {d18}, [r0], r1
460 vld1.64 {d16}, [r0], r1
461 vld1.64 {d0}, [r0], r1
462 vld1.64 {d2}, [r0], r1
463 vld1.64 {d4}, [r0], r1
464 vld1.64 {d26}, [r0], r1
465 vld1.64 {d7}, [r0], r1
466 vld1.64 {d21}, [r0], r1
467 vld1.64 {d19}, [r0], r1
468 vld1.64 {d17}, [r0], r1
469 vld1.64 {d1}, [r0], r1
470 vld1.64 {d3}, [r0], r1
471 vld1.64 {d5}, [r0], r1
472 vld1.64 {d27}, [r0], r1
473
5813e05d 474 transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
ad74a0f8
MR
475
476 align_push_regs
ad74a0f8
MR
477
478 h264_loop_filter_luma
479
@ Only p1,p0,q0,q1 changed; transpose those 4 columns back.
2da4e5e3 480 transpose_4x4 q4, q8, q0, q5
ad74a0f8
MR
481
@ Rewind 16 rows; +2 skips the unchanged p3/p2 bytes of each row.
482 sub r0, r0, r1, lsl #4
2da4e5e3
MR
483 add r0, r0, #2
484 vst1.32 {d8[0]}, [r0], r1
485 vst1.32 {d16[0]}, [r0], r1
486 vst1.32 {d0[0]}, [r0], r1
487 vst1.32 {d10[0]}, [r0], r1
488 vst1.32 {d8[1]}, [r0], r1
489 vst1.32 {d16[1]}, [r0], r1
490 vst1.32 {d0[1]}, [r0], r1
491 vst1.32 {d10[1]}, [r0], r1
492 vst1.32 {d9[0]}, [r0], r1
493 vst1.32 {d17[0]}, [r0], r1
494 vst1.32 {d1[0]}, [r0], r1
495 vst1.32 {d11[0]}, [r0], r1
496 vst1.32 {d9[1]}, [r0], r1
497 vst1.32 {d17[1]}, [r0], r1
498 vst1.32 {d1[1]}, [r0], r1
499 vst1.32 {d11[1]}, [r0], r1
ad74a0f8
MR
500
501 align_pop_regs
502 bx lr
503 .endfunc
504
@ h264_loop_filter_chroma: normal chroma deblocking of an 8-pixel edge.
@ Entry: d16 = p0, d18 = p1, d0 = q0, d2 = q1, r2 = alpha, r3 = beta,
@ d24 = packed tc0.  Outputs new p0 in d16 and new q0 in d0.
@ delta = clip(((q0-p0)*4 + (p1-q1) + 4) >> 3, -tc0, tc0), applied only
@ where |p0-q0|<alpha, |p1-p0|<beta, |q1-q0|<beta and tc0 >= 0.
505 .macro h264_loop_filter_chroma
506 vdup.8 d22, r2 @ alpha
507 vmovl.u8 q12, d24
508 vabd.u8 d26, d16, d0 @ abs(p0 - q0)
509 vmovl.u8 q2, d0
510 vabd.u8 d28, d18, d16 @ abs(p1 - p0)
511 vsubw.u8 q2, q2, d16
@ Replicate each tc0 byte across its 2-pixel chroma group.
512 vsli.16 d24, d24, #8
513 vshl.i16 q2, q2, #2
514 vabd.u8 d30, d2, d0 @ abs(q1 - q0)
515 vaddw.u8 q2, q2, d18
516 vclt.u8 d26, d26, d22 @ < alpha
517 vsubw.u8 q2, q2, d2
518 vdup.8 d22, r3 @ beta
519 vclt.s8 d25, d24, #0
520 vrshrn.i16 d4, q2, #3
521 vclt.u8 d28, d28, d22 @ < beta
522 vbic d26, d26, d25
523 vclt.u8 d30, d30, d22 @ < beta
524 vand d26, d26, d28
525 vneg.s8 d25, d24
526 vand d26, d26, d30
@ Clip delta to [-tc0, tc0] and zero it outside the filter mask.
527 vmin.s8 d4, d4, d24
528 vmovl.u8 q14, d16
529 vand d4, d4, d26
530 vmax.s8 d4, d4, d25
531 vmovl.u8 q11, d0
@ p0' = sat(p0 + delta), q0' = sat(q0 - delta).
532 vaddw.s8 q14, q14, d4
533 vsubw.s8 q11, q11, d4
534 vqmovun.s16 d16, q14
535 vqmovun.s16 d0, q11
536 .endm
537
@ ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
@                                   int beta, int8_t *tc0)
@ Horizontal chroma edge: load p1,p0,q0,q1 rows, filter, store p0,q0.
538function ff_h264_v_loop_filter_chroma_neon, export=1
539 h264_loop_filter_start
540
541 sub r0, r0, r1, lsl #1
542 vld1.64 {d18}, [r0,:64], r1
543 vld1.64 {d16}, [r0,:64], r1
544 vld1.64 {d0}, [r0,:64], r1
545 vld1.64 {d2}, [r0,:64]
546
547 h264_loop_filter_chroma
548
549 sub r0, r0, r1, lsl #1
550 vst1.64 {d16}, [r0,:64], r1
551 vst1.64 {d0}, [r0,:64], r1
552
553 bx lr
554 .endfunc
555
@ ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
@                                   int beta, int8_t *tc0)
@ Vertical chroma edge: gather 8 rows of 4 pixels around the edge,
@ transpose 4x4, filter, transpose back and scatter the rows again.
556function ff_h264_h_loop_filter_chroma_neon, export=1
557 h264_loop_filter_start
558
559 sub r0, r0, #2
560 vld1.32 {d18[0]}, [r0], r1
561 vld1.32 {d16[0]}, [r0], r1
562 vld1.32 {d0[0]}, [r0], r1
563 vld1.32 {d2[0]}, [r0], r1
564 vld1.32 {d18[1]}, [r0], r1
565 vld1.32 {d16[1]}, [r0], r1
566 vld1.32 {d0[1]}, [r0], r1
567 vld1.32 {d2[1]}, [r0], r1
568
569 vtrn.16 d18, d0
570 vtrn.16 d16, d2
571 vtrn.8 d18, d16
572 vtrn.8 d0, d2
573
574 h264_loop_filter_chroma
575
@ Same transpose network is its own inverse for this 4x4 layout.
576 vtrn.16 d18, d0
577 vtrn.16 d16, d2
578 vtrn.8 d18, d16
579 vtrn.8 d0, d2
580
581 sub r0, r0, r1, lsl #3
582 vst1.32 {d18[0]}, [r0], r1
583 vst1.32 {d16[0]}, [r0], r1
584 vst1.32 {d0[0]}, [r0], r1
585 vst1.32 {d2[0]}, [r0], r1
586 vst1.32 {d18[1]}, [r0], r1
587 vst1.32 {d16[1]}, [r0], r1
588 vst1.32 {d0[1]}, [r0], r1
589 vst1.32 {d2[1]}, [r0], r1
590
591 bx lr
592 .endfunc
5813e05d
MR
593
594 /* H.264 qpel MC */
595
@ lowpass_const: load the 6-tap qpel filter multipliers into d6[0]:
@ \r = 0x00140005, so d6[0] holds 5 and d6[1] holds 20 as 16-bit
@ lanes, consumed by the vmla/vmls d6[1]/d6[0] in lowpass_8 below.
@ Clobbers \r.
596 .macro lowpass_const r
597 movw \r, #5
598 movt \r, #20
599 vmov.32 d6[0], \r
600 .endm
601
@ lowpass_8: apply the H.264 6-tap half-pel filter (1,-5,20,20,-5,1)
@ horizontally to two 8-pixel rows held in \r0:\r1 and \r2:\r3.
@ With narrow=1 the rounded (v+16)>>5 result is narrowed into \d0/\d1
@ (8-bit); with narrow=0 the raw 16-bit sums are left in \d0/\d1
@ (q-registers) for a later second pass.  Clobbers q1, q2, q9, q10,
@ d30, d31 (and q0/q8 as temporaries when narrowing).
602 .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
603.if \narrow
604 t0 .req q0
605 t1 .req q8
606.else
607 t0 .req \d0
608 t1 .req \d1
609.endif
@ Tap pairs: (x+2,x+3)*20, (x+1,x+4)*-5, (x,x+5)*1.
610 vext.8 d2, \r0, \r1, #2
611 vext.8 d3, \r0, \r1, #3
612 vaddl.u8 q1, d2, d3
613 vext.8 d4, \r0, \r1, #1
614 vext.8 d5, \r0, \r1, #4
615 vaddl.u8 q2, d4, d5
616 vext.8 d30, \r0, \r1, #5
617 vaddl.u8 t0, \r0, d30
618 vext.8 d18, \r2, \r3, #2
619 vmla.i16 t0, q1, d6[1]
620 vext.8 d19, \r2, \r3, #3
621 vaddl.u8 q9, d18, d19
622 vext.8 d20, \r2, \r3, #1
623 vmls.i16 t0, q2, d6[0]
624 vext.8 d21, \r2, \r3, #4
625 vaddl.u8 q10, d20, d21
626 vext.8 d31, \r2, \r3, #5
627 vaddl.u8 t1, \r2, d31
628 vmla.i16 t1, q9, d6[1]
629 vmls.i16 t1, q10, d6[0]
630.if \narrow
631 vqrshrun.s16 \d0, t0, #5
632 vqrshrun.s16 \d1, t1, #5
633.endif
634 .unreq t0
635 .unreq t1
636 .endm
637
@ lowpass_8_1: single-row variant of lowpass_8 — 6-tap horizontal
@ filter on the 8 pixels in \r0:\r1; result narrowed into \d0 when
@ narrow=1, else left as 16-bit sums in \d0.  Clobbers q1, q2, d30.
638 .macro lowpass_8_1 r0, r1, d0, narrow=1
639.if \narrow
640 t0 .req q0
641.else
642 t0 .req \d0
643.endif
644 vext.8 d2, \r0, \r1, #2
645 vext.8 d3, \r0, \r1, #3
646 vaddl.u8 q1, d2, d3
647 vext.8 d4, \r0, \r1, #1
648 vext.8 d5, \r0, \r1, #4
649 vaddl.u8 q2, d4, d5
650 vext.8 d30, \r0, \r1, #5
651 vaddl.u8 t0, \r0, d30
652 vmla.i16 t0, q1, d6[1]
653 vmls.i16 t0, q2, d6[0]
654.if \narrow
655 vqrshrun.s16 \d0, t0, #5
656.endif
657 .unreq t0
658 .endm
659
@ lowpass_8.16: second (vertical) pass of the 2-D qpel filter, on the
@ 16-bit intermediates produced by a narrow=0 first pass.  Input row is
@ \r0:\r1 (16-bit lanes, \l*/\h* are the matching d-halves); the taps
@ *20 and *5 are built from shifts (20x = 16x + 4x, 5x = 4x + x),
@ rounded with >>10 and saturate-narrowed to 8-bit into \d.
@ Clobbers q0-q3, q8-q10, q15 and rewrites \r1.
660 .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
661 vext.16 q1, \r0, \r1, #2
662 vext.16 q0, \r0, \r1, #3
663 vaddl.s16 q9, d2, d0
664 vext.16 q2, \r0, \r1, #1
665 vaddl.s16 q1, d3, d1
666 vext.16 q3, \r0, \r1, #4
667 vaddl.s16 q10, d4, d6
668 vext.16 \r1, \r0, \r1, #5
669 vaddl.s16 q2, d5, d7
670 vaddl.s16 q0, \h0, \h1
671 vaddl.s16 q8, \l0, \l1
672
@ q9 = 20 * (x+2 ⊕ x+3) sums, q10 = 5 * (x+1 ⊕ x+4) sums (low half).
673 vshl.i32 q3, q9, #4
674 vshl.i32 q9, q9, #2
675 vshl.i32 q15, q10, #2
676 vadd.i32 q9, q9, q3
677 vadd.i32 q10, q10, q15
678
@ Same for the high half in q1/q2.
679 vshl.i32 q3, q1, #4
680 vshl.i32 q1, q1, #2
681 vshl.i32 q15, q2, #2
682 vadd.i32 q1, q1, q3
683 vadd.i32 q2, q2, q15
684
685 vadd.i32 q9, q9, q8
686 vsub.i32 q9, q9, q10
687
688 vadd.i32 q1, q1, q0
689 vsub.i32 q1, q1, q2
690
@ Combined rounding of both passes: (v + 512) >> 10, then saturate.
691 vrshrn.s32 d18, q9, #10
692 vrshrn.s32 d19, q1, #10
693
694 vqmovun.s16 \d, q9
695 .endm
696
@ put_h264_qpel16_h_lowpass_neon_packed: run the 8-wide horizontal
@ lowpass over a 16x16 area as two 8-wide column passes (left half,
@ then right half at src+8), tail-calling the second pass.
@ Clobbers r4 (used to preserve lr across the first bl).
697function put_h264_qpel16_h_lowpass_neon_packed
698 mov r4, lr
699 mov ip, #16
700 mov r3, #8
701 bl put_h264_qpel8_h_lowpass_neon
702 sub r1, r1, r2, lsl #4
703 add r1, r1, #8
704 mov ip, #16
705 mov lr, r4
706 b put_h264_qpel8_h_lowpass_neon
707 .endfunc
708
04e7f6d2
MR
@ h264_qpel_h_lowpass: horizontal 6-tap qpel filter, put/avg variants.
@ 16-wide version falls through into the 8-wide one after adjusting
@ src/dst to the right half; 8-wide loop does two rows per iteration.
@ r0 = dst (stride r3), r1 = src (stride r2), ip = row count.
709 .macro h264_qpel_h_lowpass type
710function \type\()_h264_qpel16_h_lowpass_neon
5813e05d
MR
711 push {lr}
712 mov ip, #16
04e7f6d2 713 bl \type\()_h264_qpel8_h_lowpass_neon
5813e05d
MR
714 sub r0, r0, r3, lsl #4
715 sub r1, r1, r2, lsl #4
716 add r0, r0, #8
717 add r1, r1, #8
718 mov ip, #16
719 pop {lr}
720 .endfunc
721
04e7f6d2 722function \type\()_h264_qpel8_h_lowpass_neon
5813e05d
MR
7231: vld1.64 {d0, d1}, [r1], r2
724 vld1.64 {d16,d17}, [r1], r2
725 subs ip, ip, #2
726 lowpass_8 d0, d1, d16, d17, d0, d16
04e7f6d2
MR
@ avg variant: round-average the filtered rows with existing dst.
727.ifc \type,avg
728 vld1.8 {d2}, [r0,:64], r3
729 vrhadd.u8 d0, d0, d2
730 vld1.8 {d3}, [r0,:64]
731 vrhadd.u8 d16, d16, d3
732 sub r0, r0, r3
733.endif
5813e05d
MR
734 vst1.64 {d0}, [r0,:64], r3
735 vst1.64 {d16}, [r0,:64], r3
736 bne 1b
737 bx lr
738 .endfunc
04e7f6d2
MR
739 .endm
740
741 h264_qpel_h_lowpass put
742 h264_qpel_h_lowpass avg
5813e05d 743
04e7f6d2
MR
@ h264_qpel_h_lowpass_l2: horizontal 6-tap filter averaged ("l2") with
@ a second source (r3, same stride r2) — used for quarter-pel positions
@ that mix the half-pel result with a full-pel neighbour.
@ r0 = dst, r1 = filter src, r3 = second src, ip = row count.
744 .macro h264_qpel_h_lowpass_l2 type
745function \type\()_h264_qpel16_h_lowpass_l2_neon
5813e05d
MR
746 push {lr}
747 mov ip, #16
04e7f6d2 748 bl \type\()_h264_qpel8_h_lowpass_l2_neon
5813e05d
MR
749 sub r0, r0, r2, lsl #4
750 sub r1, r1, r2, lsl #4
751 sub r3, r3, r2, lsl #4
752 add r0, r0, #8
753 add r1, r1, #8
754 add r3, r3, #8
755 mov ip, #16
756 pop {lr}
757 .endfunc
758
04e7f6d2 759function \type\()_h264_qpel8_h_lowpass_l2_neon
5813e05d
MR
7601: vld1.64 {d0, d1}, [r1], r2
761 vld1.64 {d16,d17}, [r1], r2
762 vld1.64 {d28}, [r3], r2
763 vld1.64 {d29}, [r3], r2
764 subs ip, ip, #2
765 lowpass_8 d0, d1, d16, d17, d0, d1
@ Average filtered rows with the second-source rows.
766 vrhadd.u8 q0, q0, q14
04e7f6d2
MR
767.ifc \type,avg
768 vld1.8 {d2}, [r0,:64], r2
769 vrhadd.u8 d0, d0, d2
770 vld1.8 {d3}, [r0,:64]
771 vrhadd.u8 d1, d1, d3
772 sub r0, r0, r2
773.endif
5813e05d
MR
774 vst1.64 {d0}, [r0,:64], r2
775 vst1.64 {d1}, [r0,:64], r2
776 bne 1b
777 bx lr
778 .endfunc
04e7f6d2
MR
779 .endm
780
781 h264_qpel_h_lowpass_l2 put
782 h264_qpel_h_lowpass_l2 avg
5813e05d
MR
783
@ put_h264_qpel16_v_lowpass_neon_packed: vertical lowpass over a 16x16
@ area as four 8x8 tiles (top-left, bottom-left, top-right,
@ bottom-right), tail-calling the last tile.  Clobbers r4.
784function put_h264_qpel16_v_lowpass_neon_packed
785 mov r4, lr
786 mov r2, #8
787 bl put_h264_qpel8_v_lowpass_neon
788 sub r1, r1, r3, lsl #2
789 bl put_h264_qpel8_v_lowpass_neon
790 sub r1, r1, r3, lsl #4
791 sub r1, r1, r3, lsl #2
792 add r1, r1, #8
793 bl put_h264_qpel8_v_lowpass_neon
794 sub r1, r1, r3, lsl #2
795 mov lr, r4
796 b put_h264_qpel8_v_lowpass_neon
797 .endfunc
798
04e7f6d2
MR
@ h264_qpel_v_lowpass: vertical 6-tap qpel filter, put/avg variants.
@ The 8-wide core loads 13 rows (8 output rows + 5 filter taps),
@ transposes so columns become rows, reuses the horizontal lowpass_8,
@ and transposes back.  r0 = dst (stride r2), r1 = src (stride r3).
799 .macro h264_qpel_v_lowpass type
800function \type\()_h264_qpel16_v_lowpass_neon
5813e05d 801 mov r4, lr
04e7f6d2 802 bl \type\()_h264_qpel8_v_lowpass_neon
5813e05d 803 sub r1, r1, r3, lsl #2
04e7f6d2 804 bl \type\()_h264_qpel8_v_lowpass_neon
5813e05d
MR
805 sub r0, r0, r2, lsl #4
806 add r0, r0, #8
807 sub r1, r1, r3, lsl #4
808 sub r1, r1, r3, lsl #2
809 add r1, r1, #8
04e7f6d2 810 bl \type\()_h264_qpel8_v_lowpass_neon
5813e05d
MR
811 sub r1, r1, r3, lsl #2
812 mov lr, r4
813 .endfunc
814
04e7f6d2 815function \type\()_h264_qpel8_v_lowpass_neon
5813e05d
MR
816 vld1.64 {d8}, [r1], r3
817 vld1.64 {d10}, [r1], r3
818 vld1.64 {d12}, [r1], r3
819 vld1.64 {d14}, [r1], r3
820 vld1.64 {d22}, [r1], r3
821 vld1.64 {d24}, [r1], r3
822 vld1.64 {d26}, [r1], r3
823 vld1.64 {d28}, [r1], r3
824 vld1.64 {d9}, [r1], r3
825 vld1.64 {d11}, [r1], r3
826 vld1.64 {d13}, [r1], r3
827 vld1.64 {d15}, [r1], r3
828 vld1.64 {d23}, [r1]
829
830 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
831 lowpass_8 d8, d9, d10, d11, d8, d10
832 lowpass_8 d12, d13, d14, d15, d12, d14
833 lowpass_8 d22, d23, d24, d25, d22, d24
834 lowpass_8 d26, d27, d28, d29, d26, d28
835 transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
836
04e7f6d2
MR
@ avg variant: round-average the 8 result rows with existing dst.
837.ifc \type,avg
838 vld1.8 {d9}, [r0,:64], r2
839 vrhadd.u8 d8, d8, d9
840 vld1.8 {d11}, [r0,:64], r2
841 vrhadd.u8 d10, d10, d11
842 vld1.8 {d13}, [r0,:64], r2
843 vrhadd.u8 d12, d12, d13
844 vld1.8 {d15}, [r0,:64], r2
845 vrhadd.u8 d14, d14, d15
846 vld1.8 {d23}, [r0,:64], r2
847 vrhadd.u8 d22, d22, d23
848 vld1.8 {d25}, [r0,:64], r2
849 vrhadd.u8 d24, d24, d25
850 vld1.8 {d27}, [r0,:64], r2
851 vrhadd.u8 d26, d26, d27
852 vld1.8 {d29}, [r0,:64], r2
853 vrhadd.u8 d28, d28, d29
854 sub r0, r0, r2, lsl #3
855.endif
856
5813e05d
MR
857 vst1.64 {d8}, [r0,:64], r2
858 vst1.64 {d10}, [r0,:64], r2
859 vst1.64 {d12}, [r0,:64], r2
860 vst1.64 {d14}, [r0,:64], r2
861 vst1.64 {d22}, [r0,:64], r2
862 vst1.64 {d24}, [r0,:64], r2
863 vst1.64 {d26}, [r0,:64], r2
864 vst1.64 {d28}, [r0,:64], r2
865
866 bx lr
867 .endfunc
04e7f6d2
MR
868 .endm
869
870 h264_qpel_v_lowpass put
871 h264_qpel_v_lowpass avg
5813e05d 872
04e7f6d2
MR
@ h264_qpel_v_lowpass_l2: vertical 6-tap filter averaged with a second
@ source read through ip (stride r2).  Same transpose/filter/transpose
@ structure as h264_qpel_v_lowpass; r0 = dst (stride r3), r1 = src
@ (stride r3).
873 .macro h264_qpel_v_lowpass_l2 type
874function \type\()_h264_qpel16_v_lowpass_l2_neon
5813e05d 875 mov r4, lr
04e7f6d2 876 bl \type\()_h264_qpel8_v_lowpass_l2_neon
5813e05d 877 sub r1, r1, r3, lsl #2
04e7f6d2 878 bl \type\()_h264_qpel8_v_lowpass_l2_neon
5813e05d
MR
879 sub r0, r0, r3, lsl #4
880 sub ip, ip, r2, lsl #4
881 add r0, r0, #8
882 add ip, ip, #8
883 sub r1, r1, r3, lsl #4
884 sub r1, r1, r3, lsl #2
885 add r1, r1, #8
04e7f6d2 886 bl \type\()_h264_qpel8_v_lowpass_l2_neon
5813e05d
MR
887 sub r1, r1, r3, lsl #2
888 mov lr, r4
889 .endfunc
890
04e7f6d2 891function \type\()_h264_qpel8_v_lowpass_l2_neon
5813e05d
MR
892 vld1.64 {d8}, [r1], r3
893 vld1.64 {d10}, [r1], r3
894 vld1.64 {d12}, [r1], r3
895 vld1.64 {d14}, [r1], r3
896 vld1.64 {d22}, [r1], r3
897 vld1.64 {d24}, [r1], r3
898 vld1.64 {d26}, [r1], r3
899 vld1.64 {d28}, [r1], r3
900 vld1.64 {d9}, [r1], r3
901 vld1.64 {d11}, [r1], r3
902 vld1.64 {d13}, [r1], r3
903 vld1.64 {d15}, [r1], r3
904 vld1.64 {d23}, [r1]
905
906 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
907 lowpass_8 d8, d9, d10, d11, d8, d9
908 lowpass_8 d12, d13, d14, d15, d12, d13
909 lowpass_8 d22, d23, d24, d25, d22, d23
910 lowpass_8 d26, d27, d28, d29, d26, d27
911 transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
912
@ Load second-source rows via ip and average with filtered output.
913 vld1.64 {d0}, [ip], r2
914 vld1.64 {d1}, [ip], r2
915 vld1.64 {d2}, [ip], r2
916 vld1.64 {d3}, [ip], r2
917 vld1.64 {d4}, [ip], r2
918 vrhadd.u8 q0, q0, q4
919 vld1.64 {d5}, [ip], r2
920 vrhadd.u8 q1, q1, q6
921 vld1.64 {d10}, [ip], r2
922 vrhadd.u8 q2, q2, q11
923 vld1.64 {d11}, [ip], r2
04e7f6d2
MR
924 vrhadd.u8 q5, q5, q13
925
926.ifc \type,avg
927 vld1.8 {d16}, [r0,:64], r3
928 vrhadd.u8 d0, d0, d16
929 vld1.8 {d17}, [r0,:64], r3
930 vrhadd.u8 d1, d1, d17
931 vld1.8 {d16}, [r0,:64], r3
932 vrhadd.u8 d2, d2, d16
933 vld1.8 {d17}, [r0,:64], r3
934 vrhadd.u8 d3, d3, d17
935 vld1.8 {d16}, [r0,:64], r3
936 vrhadd.u8 d4, d4, d16
937 vld1.8 {d17}, [r0,:64], r3
938 vrhadd.u8 d5, d5, d17
939 vld1.8 {d16}, [r0,:64], r3
940 vrhadd.u8 d10, d10, d16
941 vld1.8 {d17}, [r0,:64], r3
942 vrhadd.u8 d11, d11, d17
943 sub r0, r0, r3, lsl #3
944.endif
5813e05d
MR
945
946 vst1.64 {d0}, [r0,:64], r3
947 vst1.64 {d1}, [r0,:64], r3
5813e05d
MR
948 vst1.64 {d2}, [r0,:64], r3
949 vst1.64 {d3}, [r0,:64], r3
950 vst1.64 {d4}, [r0,:64], r3
951 vst1.64 {d5}, [r0,:64], r3
952 vst1.64 {d10}, [r0,:64], r3
953 vst1.64 {d11}, [r0,:64], r3
954
955 bx lr
956 .endfunc
04e7f6d2
MR
957 .endm
958
959 h264_qpel_v_lowpass_l2 put
960 h264_qpel_v_lowpass_l2 avg
5813e05d
MR
961
@ put_h264_qpel8_hv_lowpass_neon_top: common core of the 2-D (centre)
@ qpel positions for an 8x8 block.  Pass 1: horizontal 6-tap filter of
@ 13 input rows, keeping 16-bit intermediates in a scratch buffer at
@ r4.  The intermediates are then transposed (swap4 + transpose16_4x4)
@ and pass 2 (lowpass_8.16, vertical via the transpose) produces 8-bit
@ rows in d8-d15, transposed back before returning.
@ r1 = src (stride r3), r4 = 16*12-byte scratch.  Heavy clobbers.
962function put_h264_qpel8_hv_lowpass_neon_top
963 lowpass_const ip
964 mov ip, #12
9651: vld1.64 {d0, d1}, [r1], r3
966 vld1.64 {d16,d17}, [r1], r3
967 subs ip, ip, #2
968 lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
969 vst1.64 {d22-d25}, [r4,:128]!
970 bne 1b
971
@ 13th (final) input row.
972 vld1.64 {d0, d1}, [r1]
973 lowpass_8_1 d0, d1, q12, narrow=0
974
@ Walk the scratch buffer backwards (ip = -16) reloading the
@ 16-bit rows into q0..q10/q15.
975 mov ip, #-16
976 add r4, r4, ip
977 vld1.64 {d30,d31}, [r4,:128], ip
978 vld1.64 {d20,d21}, [r4,:128], ip
979 vld1.64 {d18,d19}, [r4,:128], ip
980 vld1.64 {d16,d17}, [r4,:128], ip
981 vld1.64 {d14,d15}, [r4,:128], ip
982 vld1.64 {d12,d13}, [r4,:128], ip
983 vld1.64 {d10,d11}, [r4,:128], ip
984 vld1.64 {d8, d9}, [r4,:128], ip
985 vld1.64 {d6, d7}, [r4,:128], ip
986 vld1.64 {d4, d5}, [r4,:128], ip
987 vld1.64 {d2, d3}, [r4,:128], ip
988 vld1.64 {d0, d1}, [r4,:128]
989
990 swap4 d1, d3, d5, d7, d8, d10, d12, d14
991 transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
992
993 swap4 d17, d19, d21, d31, d24, d26, d28, d22
994 transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
995
@ Spill half the transposed columns back to scratch; they are
@ re-read below for the last four lowpass_8.16 calls.
996 vst1.64 {d30,d31}, [r4,:128]!
997 vst1.64 {d6, d7}, [r4,:128]!
998 vst1.64 {d20,d21}, [r4,:128]!
999 vst1.64 {d4, d5}, [r4,:128]!
1000 vst1.64 {d18,d19}, [r4,:128]!
1001 vst1.64 {d2, d3}, [r4,:128]!
1002 vst1.64 {d16,d17}, [r4,:128]!
1003 vst1.64 {d0, d1}, [r4,:128]
1004
1005 lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
1006 lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
1007 lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
1008 lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
1009
1010 vld1.64 {d16,d17}, [r4,:128], ip
1011 vld1.64 {d30,d31}, [r4,:128], ip
1012 lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
1013 vld1.64 {d16,d17}, [r4,:128], ip
1014 vld1.64 {d30,d31}, [r4,:128], ip
1015 lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
1016 vld1.64 {d16,d17}, [r4,:128], ip
1017 vld1.64 {d30,d31}, [r4,:128], ip
1018 lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
1019 vld1.64 {d16,d17}, [r4,:128], ip
1020 vld1.64 {d30,d31}, [r4,:128]
1021 lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
1022
1023 transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
1024
1025 bx lr
1026 .endfunc
1027
04e7f6d2
MR
@ h264_qpel8_hv_lowpass: store (or average into) the 8x8 centre-qpel
@ result computed by put_h264_qpel8_hv_lowpass_neon_top.
@ r0 = dst (stride r2); r10 preserves lr across the bl.
1028 .macro h264_qpel8_hv_lowpass type
1029function \type\()_h264_qpel8_hv_lowpass_neon
5813e05d
MR
1030 mov r10, lr
1031 bl put_h264_qpel8_hv_lowpass_neon_top
04e7f6d2
MR
1032.ifc \type,avg
1033 vld1.8 {d0}, [r0,:64], r2
1034 vrhadd.u8 d12, d12, d0
1035 vld1.8 {d1}, [r0,:64], r2
1036 vrhadd.u8 d13, d13, d1
1037 vld1.8 {d2}, [r0,:64], r2
1038 vrhadd.u8 d14, d14, d2
1039 vld1.8 {d3}, [r0,:64], r2
1040 vrhadd.u8 d15, d15, d3
1041 vld1.8 {d4}, [r0,:64], r2
1042 vrhadd.u8 d8, d8, d4
1043 vld1.8 {d5}, [r0,:64], r2
1044 vrhadd.u8 d9, d9, d5
1045 vld1.8 {d6}, [r0,:64], r2
1046 vrhadd.u8 d10, d10, d6
1047 vld1.8 {d7}, [r0,:64], r2
1048 vrhadd.u8 d11, d11, d7
1049 sub r0, r0, r2, lsl #3
1050.endif
5813e05d
MR
1051 vst1.64 {d12}, [r0,:64], r2
1052 vst1.64 {d13}, [r0,:64], r2
1053 vst1.64 {d14}, [r0,:64], r2
1054 vst1.64 {d15}, [r0,:64], r2
1055 vst1.64 {d8}, [r0,:64], r2
1056 vst1.64 {d9}, [r0,:64], r2
1057 vst1.64 {d10}, [r0,:64], r2
1058 vst1.64 {d11}, [r0,:64], r2
1059
1060 mov lr, r10
1061 bx lr
1062 .endfunc
04e7f6d2
MR
1063 .endm
1064
1065 h264_qpel8_hv_lowpass put
1066 h264_qpel8_hv_lowpass avg
5813e05d 1067
04e7f6d2
MR
@ h264_qpel8_hv_lowpass_l2: centre-qpel 8x8 result averaged ("l2")
@ with a second 8x8 source read sequentially from r2, then stored
@ (or avg'd) to r0 with stride r3.  r10 preserves lr across the bl.
1068 .macro h264_qpel8_hv_lowpass_l2 type
1069function \type\()_h264_qpel8_hv_lowpass_l2_neon
5813e05d
MR
1070 mov r10, lr
1071 bl put_h264_qpel8_hv_lowpass_neon_top
1072
@ Average the hv-filtered rows (q4-q7 layout from _top) with the
@ contiguous second source at r2.
1073 vld1.64 {d0, d1}, [r2,:128]!
1074 vld1.64 {d2, d3}, [r2,:128]!
1075 vrhadd.u8 q0, q0, q6
1076 vld1.64 {d4, d5}, [r2,:128]!
1077 vrhadd.u8 q1, q1, q7
1078 vld1.64 {d6, d7}, [r2,:128]!
1079 vrhadd.u8 q2, q2, q4
5813e05d 1080 vrhadd.u8 q3, q3, q5
04e7f6d2
MR
1081.ifc \type,avg
1082 vld1.8 {d16}, [r0,:64], r3
1083 vrhadd.u8 d0, d0, d16
1084 vld1.8 {d17}, [r0,:64], r3
1085 vrhadd.u8 d1, d1, d17
1086 vld1.8 {d18}, [r0,:64], r3
1087 vrhadd.u8 d2, d2, d18
1088 vld1.8 {d19}, [r0,:64], r3
1089 vrhadd.u8 d3, d3, d19
1090 vld1.8 {d20}, [r0,:64], r3
1091 vrhadd.u8 d4, d4, d20
1092 vld1.8 {d21}, [r0,:64], r3
1093 vrhadd.u8 d5, d5, d21
1094 vld1.8 {d22}, [r0,:64], r3
1095 vrhadd.u8 d6, d6, d22
1096 vld1.8 {d23}, [r0,:64], r3
1097 vrhadd.u8 d7, d7, d23
1098 sub r0, r0, r3, lsl #3
1099.endif
1100 vst1.64 {d0}, [r0,:64], r3
5813e05d
MR
1101 vst1.64 {d1}, [r0,:64], r3
1102 vst1.64 {d2}, [r0,:64], r3
1103 vst1.64 {d3}, [r0,:64], r3
1104 vst1.64 {d4}, [r0,:64], r3
1105 vst1.64 {d5}, [r0,:64], r3
1106 vst1.64 {d6}, [r0,:64], r3
1107 vst1.64 {d7}, [r0,:64], r3
1108
1109 mov lr, r10
1110 bx lr
1111 .endfunc
04e7f6d2
MR
1112 .endm
1113
1114 h264_qpel8_hv_lowpass_l2 put
1115 h264_qpel8_hv_lowpass_l2 avg
5813e05d 1116
04e7f6d2
MR
@ h264_qpel16_hv: 16x16 centre-qpel variants built from four 8x8
@ tiles of the hv (and hv_l2) lowpass, walking src/dst through the
@ left column then the right column; r9 preserves lr, and the last
@ tile is reached by tail-call.
1117 .macro h264_qpel16_hv type
1118function \type\()_h264_qpel16_hv_lowpass_neon
5813e05d 1119 mov r9, lr
04e7f6d2 1120 bl \type\()_h264_qpel8_hv_lowpass_neon
5813e05d 1121 sub r1, r1, r3, lsl #2
04e7f6d2 1122 bl \type\()_h264_qpel8_hv_lowpass_neon
5813e05d
MR
1123 sub r1, r1, r3, lsl #4
1124 sub r1, r1, r3, lsl #2
1125 add r1, r1, #8
1126 sub r0, r0, r2, lsl #4
1127 add r0, r0, #8
04e7f6d2 1128 bl \type\()_h264_qpel8_hv_lowpass_neon
5813e05d
MR
1129 sub r1, r1, r3, lsl #2
1130 mov lr, r9
04e7f6d2 1131 b \type\()_h264_qpel8_hv_lowpass_neon
5813e05d
MR
1132 .endfunc
1133
1134function \type\()_h264_qpel16_hv_lowpass_l2_neon
5813e05d
MR
1135 mov r9, lr
@ r2 = second-source buffer, 256 bytes below the scratch area r4.
1136 sub r2, r4, #256
04e7f6d2 1137 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
5813e05d 1138 sub r1, r1, r3, lsl #2
04e7f6d2 1139 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
5813e05d
MR
1140 sub r1, r1, r3, lsl #4
1141 sub r1, r1, r3, lsl #2
1142 add r1, r1, #8
1143 sub r0, r0, r3, lsl #4
1144 add r0, r0, #8
04e7f6d2 1145 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
5813e05d
MR
1146 sub r1, r1, r3, lsl #2
1147 mov lr, r9
04e7f6d2 1148 b \type\()_h264_qpel8_hv_lowpass_l2_neon
5813e05d 1149 .endfunc
04e7f6d2 1150 .endm
5813e05d 1151
04e7f6d2
MR
1152 h264_qpel16_hv put
1153 h264_qpel16_hv avg
1154
1155 .macro h264_qpel8 type
1156function ff_\type\()_h264_qpel8_mc10_neon, export=1
5813e05d
MR
1157 lowpass_const r3
1158 mov r3, r1
1159 sub r1, r1, #2
1160 mov ip, #8
04e7f6d2 1161 b \type\()_h264_qpel8_h_lowpass_l2_neon
5813e05d
MR
1162 .endfunc
1163
04e7f6d2 1164function ff_\type\()_h264_qpel8_mc20_neon, export=1
5813e05d
MR
1165 lowpass_const r3
1166 sub r1, r1, #2
1167 mov r3, r2
1168 mov ip, #8
04e7f6d2 1169 b \type\()_h264_qpel8_h_lowpass_neon
5813e05d
MR
1170 .endfunc
1171
04e7f6d2 1172function ff_\type\()_h264_qpel8_mc30_neon, export=1
5813e05d
MR
1173 lowpass_const r3
1174 add r3, r1, #1
1175 sub r1, r1, #2
1176 mov ip, #8
04e7f6d2 1177 b \type\()_h264_qpel8_h_lowpass_l2_neon
5813e05d
MR
1178 .endfunc
1179
04e7f6d2 1180function ff_\type\()_h264_qpel8_mc01_neon, export=1
5813e05d
MR
1181 push {lr}
1182 mov ip, r1
04e7f6d2 1183\type\()_h264_qpel8_mc01:
5813e05d
MR
1184 lowpass_const r3
1185 mov r3, r2
1186 sub r1, r1, r2, lsl #1
1187 vpush {d8-d15}
04e7f6d2 1188 bl \type\()_h264_qpel8_v_lowpass_l2_neon
5813e05d
MR
1189 vpop {d8-d15}
1190 pop {pc}
1191 .endfunc
1192
04e7f6d2 1193function ff_\type\()_h264_qpel8_mc11_neon, export=1
0115b3ea 1194 push {r0, r1, r11, lr}
04e7f6d2 1195\type\()_h264_qpel8_mc11:
5813e05d 1196 lowpass_const r3
0115b3ea
MR
1197 mov r11, sp
1198 bic sp, sp, #15
5813e05d
MR
1199 sub sp, sp, #64
1200 mov r0, sp
1201 sub r1, r1, #2
1202 mov r3, #8
1203 mov ip, #8
1204 vpush {d8-d15}
1205 bl put_h264_qpel8_h_lowpass_neon
0115b3ea 1206 ldrd r0, [r11]
5813e05d
MR
1207 mov r3, r2
1208 add ip, sp, #64
1209 sub r1, r1, r2, lsl #1
1210 mov r2, #8
04e7f6d2 1211 bl \type\()_h264_qpel8_v_lowpass_l2_neon
5813e05d 1212 vpop {d8-d15}
0115b3ea
MR
1213 add sp, r11, #8
1214 pop {r11, pc}
5813e05d
MR
1215 .endfunc
1216
04e7f6d2 1217function ff_\type\()_h264_qpel8_mc21_neon, export=1
5813e05d 1218 push {r0, r1, r4, r10, r11, lr}
04e7f6d2 1219\type\()_h264_qpel8_mc21:
5813e05d
MR
1220 lowpass_const r3
1221 mov r11, sp
1222 bic sp, sp, #15
1223 sub sp, sp, #(8*8+16*12)
1224 sub r1, r1, #2
1225 mov r3, #8
1226 mov r0, sp
1227 mov ip, #8
1228 vpush {d8-d15}
1229 bl put_h264_qpel8_h_lowpass_neon
1230 mov r4, r0
1231 ldrd r0, [r11]
1232 sub r1, r1, r2, lsl #1
1233 sub r1, r1, #2
1234 mov r3, r2
1235 sub r2, r4, #64
04e7f6d2 1236 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
5813e05d
MR
1237 vpop {d8-d15}
1238 add sp, r11, #8
1239 pop {r4, r10, r11, pc}
1240 .endfunc
1241
@ Position (3,1): same as mc11, but the src saved on the stack (used by the
@ second pass after ldrd) is src+1, shifting that pass one column right.
04e7f6d2 1242function ff_\type\()_h264_qpel8_mc31_neon, export=1
5813e05d 1243 add r1, r1, #1 @ push src+1 for the second pass...
0115b3ea 1244 push {r0, r1, r11, lr}
5813e05d 1245 sub r1, r1, #1 @ ...but run the first pass from src
04e7f6d2 1246 b \type\()_h264_qpel8_mc11
5813e05d
MR
1247 .endfunc
1248
@ 8x8 luma qpel MC, position (0,2): pure vertical half-pel — single 6-tap
@ vertical lowpass, no blending (plain v_lowpass, not the _l2 variant).
04e7f6d2 1249function ff_\type\()_h264_qpel8_mc02_neon, export=1
5813e05d
MR
1250 push {lr}
1251 lowpass_const r3
1252 sub r1, r1, r2, lsl #1 @ 2 rows of top margin for the 6-tap filter
1253 mov r3, r2
1254 vpush {d8-d15} @ callee-saved NEON regs
04e7f6d2 1255 bl \type\()_h264_qpel8_v_lowpass_neon
5813e05d
MR
1256 vpop {d8-d15}
1257 pop {pc}
1258 .endfunc
1259
@ 8x8 luma qpel MC, position (1,2): vertical lowpass into stack scratch,
@ then the combined h+v lowpass blended with that scratch (_l2).
04e7f6d2 1260function ff_\type\()_h264_qpel8_mc12_neon, export=1
5813e05d 1261 push {r0, r1, r4, r10, r11, lr}
04e7f6d2 1262\type\()_h264_qpel8_mc12:
5813e05d
MR
1263 lowpass_const r3
1264 mov r11, sp @ frame base for restore
1265 bic sp, sp, #15 @ 16-byte alignment for NEON
1266 sub sp, sp, #(8*8+16*12) @ 8x8 v-pass scratch + hv intermediates
1267 sub r1, r1, r2, lsl #1 @ 2 rows of top margin
1268 mov r3, r2
1269 mov r2, #8
1270 mov r0, sp
1271 vpush {d8-d15}
1272 bl put_h264_qpel8_v_lowpass_neon
1273 mov r4, r0 @ r4 = end of v-pass output
1274 ldrd r0, [r11] @ reload saved dst/src
1275 sub r1, r1, r3, lsl #1 @ 2 rows up and 2 columns left
1276 sub r1, r1, #2 @ for the combined filter
1277 sub r2, r4, #64 @ start of the 8x8 scratch
04e7f6d2 1278 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
5813e05d
MR
1279 vpop {d8-d15}
1280 add sp, r11, #8 @ skip saved r0/r1
1281 pop {r4, r10, r11, pc}
1282 .endfunc
1283
@ 8x8 luma qpel MC, position (2,2): the center half-pel point — single
@ combined h+v lowpass with a 16*12 intermediate buffer (r4 = its base).
04e7f6d2 1284function ff_\type\()_h264_qpel8_mc22_neon, export=1
5813e05d
MR
1285 push {r4, r10, r11, lr}
1286 mov r11, sp @ frame base for restore
1287 bic sp, sp, #15 @ 16-byte alignment for NEON
1288 sub r1, r1, r2, lsl #1 @ 2 rows up...
1289 sub r1, r1, #2 @ ...and 2 columns left for the 6-tap taps
1290 mov r3, r2
1291 sub sp, sp, #(16*12) @ intermediate buffer for the hv filter
1292 mov r4, sp
1293 vpush {d8-d15}
04e7f6d2 1294 bl \type\()_h264_qpel8_hv_lowpass_neon
5813e05d
MR
1295 vpop {d8-d15}
1296 mov sp, r11 @ nothing extra pushed: plain restore
1297 pop {r4, r10, r11, pc}
1298 .endfunc
1299
@ Position (3,2): mc12 with the saved src one column to the right
@ (the pushed r1 is the original; r1+1 is used for the first pass).
04e7f6d2 1300function ff_\type\()_h264_qpel8_mc32_neon, export=1
5813e05d
MR
1301 push {r0, r1, r4, r10, r11, lr}
1302 add r1, r1, #1 @ run mc12's first pass from src+1
04e7f6d2 1303 b \type\()_h264_qpel8_mc12
5813e05d
MR
1304 .endfunc
1305
@ Position (0,3): reuses the mc01 tail but blends with the row below
@ (ip = src + stride instead of src).
04e7f6d2 1306function ff_\type\()_h264_qpel8_mc03_neon, export=1
5813e05d
MR
1307 push {lr}
1308 add ip, r1, r2 @ l2 blend input = next row
04e7f6d2 1309 b \type\()_h264_qpel8_mc01
5813e05d
MR
1310 .endfunc
1311
@ Position (1,3): mc11 with the first (horizontal) pass run one row down;
@ the pushed r1 keeps the original src for the second pass.
04e7f6d2 1312function ff_\type\()_h264_qpel8_mc13_neon, export=1
0115b3ea 1313 push {r0, r1, r11, lr}
5813e05d 1314 add r1, r1, r2 @ h-pass reads from src+stride
04e7f6d2 1315 b \type\()_h264_qpel8_mc11
5813e05d
MR
1316 .endfunc
1317
@ Position (2,3): mc21 with the first pass run one row down;
@ the pushed r1 keeps the original src for the second pass.
04e7f6d2 1318function ff_\type\()_h264_qpel8_mc23_neon, export=1
5813e05d
MR
1319 push {r0, r1, r4, r10, r11, lr}
1320 add r1, r1, r2 @ h-pass reads from src+stride
04e7f6d2 1321 b \type\()_h264_qpel8_mc21
5813e05d
MR
1322 .endfunc
1323
@ Position (3,3): mc11 with the saved src shifted one column right and the
@ first pass run one row down from the original src.
04e7f6d2 1324function ff_\type\()_h264_qpel8_mc33_neon, export=1
5813e05d 1325 add r1, r1, #1 @ push src+1 for the second pass...
0115b3ea 1326 push {r0, r1, r11, lr}
5813e05d
MR
1327 add r1, r1, r2 @ ...first pass from src+stride
1328 sub r1, r1, #1
04e7f6d2 1329 b \type\()_h264_qpel8_mc11
5813e05d 1330 .endfunc
04e7f6d2
MR
1331 .endm
1332
@ Instantiate both write modes of the 8x8 qpel functions:
@ "put" stores the result, "avg" averages it into the existing dst.
1333 h264_qpel8 put
1334 h264_qpel8 avg
5813e05d 1335
04e7f6d2
MR
@ 16x16 luma quarter-pel MC. Mirrors the 8x8 h264_qpel8 macro above:
@ one function per fractional position mc<x><y> (x = horizontal quarter
@ offset, y = vertical), with 16x16-sized stack scratch buffers and
@ "packed" first-pass helpers for the two-pass positions.
@ r0 = dst, r1 = src, r2 = stride; \type is "put" or "avg".
1336 .macro h264_qpel16 type
1337function ff_\type\()_h264_qpel16_mc10_neon, export=1
5813e05d
MR
1338 lowpass_const r3
1339 mov r3, r1 @ l2 blend input = src (x=1)
1340 sub r1, r1, #2 @ left margin for the 6-tap filter
04e7f6d2 1341 b \type\()_h264_qpel16_h_lowpass_l2_neon
5813e05d
MR
1342 .endfunc
1343
@ (2,0): pure horizontal half-pel, no blending.
04e7f6d2 1344function ff_\type\()_h264_qpel16_mc20_neon, export=1
5813e05d
MR
1345 lowpass_const r3
1346 sub r1, r1, #2
1347 mov r3, r2
04e7f6d2 1348 b \type\()_h264_qpel16_h_lowpass_neon
5813e05d
MR
1349 .endfunc
1350
@ (3,0): like mc10 but blends with the right neighbour (src+1).
04e7f6d2 1351function ff_\type\()_h264_qpel16_mc30_neon, export=1
5813e05d
MR
1352 lowpass_const r3
1353 add r3, r1, #1 @ l2 blend input = src+1
1354 sub r1, r1, #2
04e7f6d2 1355 b \type\()_h264_qpel16_h_lowpass_l2_neon
5813e05d
MR
1356 .endfunc
1357
@ (0,1): vertical lowpass blended with the row at ip.
04e7f6d2 1358function ff_\type\()_h264_qpel16_mc01_neon, export=1
5813e05d
MR
1359 push {r4, lr}
1360 mov ip, r1 @ blend with src itself (y=1)
04e7f6d2 1361\type\()_h264_qpel16_mc01: @ mc03 enters here with ip = src+stride
5813e05d
MR
1362 lowpass_const r3
1363 mov r3, r2
1364 sub r1, r1, r2, lsl #1 @ 2 rows of top margin
1365 vpush {d8-d15} @ callee-saved NEON regs
04e7f6d2 1366 bl \type\()_h264_qpel16_v_lowpass_l2_neon
5813e05d
MR
1367 vpop {d8-d15}
1368 pop {r4, pc}
1369 .endfunc
1370
@ (1,1): h-pass into a 256-byte (16x16) stack scratch, then v-pass
@ blended with it; saved r0/r1 reloaded via ldrd for the second pass.
04e7f6d2 1371function ff_\type\()_h264_qpel16_mc11_neon, export=1
0115b3ea 1372 push {r0, r1, r4, r11, lr}
04e7f6d2 1373\type\()_h264_qpel16_mc11:
5813e05d 1374 lowpass_const r3
0115b3ea
MR
1375 mov r11, sp @ frame base for restore
1376 bic sp, sp, #15 @ align for NEON
5813e05d
MR
1377 sub sp, sp, #256 @ 16x16 byte scratch
1378 mov r0, sp
1379 sub r1, r1, #2
1380 mov r3, #16
1381 vpush {d8-d15}
1382 bl put_h264_qpel16_h_lowpass_neon
0115b3ea 1383 ldrd r0, [r11] @ reload saved dst/src
5813e05d
MR
1384 mov r3, r2
1385 add ip, sp, #64 @ scratch base (sp dropped 64 by vpush)
1386 sub r1, r1, r2, lsl #1
1387 mov r2, #16
04e7f6d2 1388 bl \type\()_h264_qpel16_v_lowpass_l2_neon
5813e05d 1389 vpop {d8-d15}
0115b3ea
MR
1390 add sp, r11, #8 @ skip saved r0/r1
1391 pop {r4, r11, pc}
5813e05d
MR
1392 .endfunc
1393
@ (2,1): packed h-pass into scratch, then hv lowpass blended with it.
04e7f6d2 1394function ff_\type\()_h264_qpel16_mc21_neon, export=1
5813e05d 1395 push {r0, r1, r4-r5, r9-r11, lr}
04e7f6d2 1396\type\()_h264_qpel16_mc21:
5813e05d
MR
1397 lowpass_const r3
1398 mov r11, sp
1399 bic sp, sp, #15
1400 sub sp, sp, #(16*16+16*12) @ 16x16 scratch + hv intermediates
1401 sub r1, r1, #2
1402 mov r0, sp
1403 vpush {d8-d15}
1404 bl put_h264_qpel16_h_lowpass_neon_packed
1405 mov r4, r0 @ r4 = packed scratch pointer for the hv pass
1406 ldrd r0, [r11] @ reload saved dst/src
1407 sub r1, r1, r2, lsl #1 @ 2 rows up and 2 columns left
1408 sub r1, r1, #2
1409 mov r3, r2
04e7f6d2 1410 bl \type\()_h264_qpel16_hv_lowpass_l2_neon
5813e05d
MR
1411 vpop {d8-d15}
1412 add sp, r11, #8
1413 pop {r4-r5, r9-r11, pc}
1414 .endfunc
1415
@ (3,1): mc11 with the saved src shifted one column right.
04e7f6d2 1416function ff_\type\()_h264_qpel16_mc31_neon, export=1
5813e05d 1417 add r1, r1, #1 @ push src+1 for the second pass...
0115b3ea 1418 push {r0, r1, r4, r11, lr}
5813e05d 1419 sub r1, r1, #1 @ ...first pass from src
04e7f6d2 1420 b \type\()_h264_qpel16_mc11
5813e05d
MR
1421 .endfunc
1422
@ (0,2): pure vertical half-pel, no blending.
04e7f6d2 1423function ff_\type\()_h264_qpel16_mc02_neon, export=1
5813e05d
MR
1424 push {r4, lr}
1425 lowpass_const r3
1426 sub r1, r1, r2, lsl #1
1427 mov r3, r2
1428 vpush {d8-d15}
04e7f6d2 1429 bl \type\()_h264_qpel16_v_lowpass_neon
5813e05d
MR
1430 vpop {d8-d15}
1431 pop {r4, pc}
1432 .endfunc
1433
@ (1,2): packed v-pass into scratch, then hv lowpass blended with it.
04e7f6d2 1434function ff_\type\()_h264_qpel16_mc12_neon, export=1
5813e05d 1435 push {r0, r1, r4-r5, r9-r11, lr}
04e7f6d2 1436\type\()_h264_qpel16_mc12:
5813e05d
MR
1437 lowpass_const r3
1438 mov r11, sp
1439 bic sp, sp, #15
1440 sub sp, sp, #(16*16+16*12)
1441 sub r1, r1, r2, lsl #1
1442 mov r0, sp
1443 mov r3, r2
1444 vpush {d8-d15}
1445 bl put_h264_qpel16_v_lowpass_neon_packed
1446 mov r4, r0 @ packed scratch pointer for the hv pass
1447 ldrd r0, [r11] @ reload saved dst/src
1448 sub r1, r1, r3, lsl #1
1449 sub r1, r1, #2
1450 mov r2, r3
04e7f6d2 1451 bl \type\()_h264_qpel16_hv_lowpass_l2_neon
5813e05d
MR
1452 vpop {d8-d15}
1453 add sp, r11, #8
1454 pop {r4-r5, r9-r11, pc}
1455 .endfunc
1456
@ (2,2): center half-pel point — single combined h+v lowpass.
04e7f6d2 1457function ff_\type\()_h264_qpel16_mc22_neon, export=1
5813e05d
MR
1458 push {r4, r9-r11, lr}
1459 lowpass_const r3
1460 mov r11, sp
1461 bic sp, sp, #15
1462 sub r1, r1, r2, lsl #1
1463 sub r1, r1, #2
1464 mov r3, r2
1465 sub sp, sp, #(16*12) @ hv intermediate buffer
1466 mov r4, sp
1467 vpush {d8-d15}
04e7f6d2 1468 bl \type\()_h264_qpel16_hv_lowpass_neon
5813e05d
MR
1469 vpop {d8-d15}
1470 mov sp, r11
1471 pop {r4, r9-r11, pc}
1472 .endfunc
1473
@ (3,2): mc12 with the first pass one column right.
04e7f6d2 1474function ff_\type\()_h264_qpel16_mc32_neon, export=1
5813e05d
MR
1475 push {r0, r1, r4-r5, r9-r11, lr}
1476 add r1, r1, #1
04e7f6d2 1477 b \type\()_h264_qpel16_mc12
5813e05d
MR
1478 .endfunc
1479
@ (0,3): mc01 tail with ip = src+stride (blend with the row below).
04e7f6d2 1480function ff_\type\()_h264_qpel16_mc03_neon, export=1
5813e05d
MR
1481 push {r4, lr}
1482 add ip, r1, r2
04e7f6d2 1483 b \type\()_h264_qpel16_mc01
5813e05d
MR
1484 .endfunc
1485
@ (1,3): mc11 with the first pass one row down.
04e7f6d2 1486function ff_\type\()_h264_qpel16_mc13_neon, export=1
0115b3ea 1487 push {r0, r1, r4, r11, lr}
5813e05d 1488 add r1, r1, r2
04e7f6d2 1489 b \type\()_h264_qpel16_mc11
5813e05d
MR
1490 .endfunc
1491
@ (2,3): mc21 with the first pass one row down.
04e7f6d2 1492function ff_\type\()_h264_qpel16_mc23_neon, export=1
5813e05d
MR
1493 push {r0, r1, r4-r5, r9-r11, lr}
1494 add r1, r1, r2
04e7f6d2 1495 b \type\()_h264_qpel16_mc21
5813e05d
MR
1496 .endfunc
1497
@ (3,3): mc11 with the saved src shifted right and the first pass one
@ row down.
04e7f6d2 1498function ff_\type\()_h264_qpel16_mc33_neon, export=1
5813e05d 1499 add r1, r1, #1
0115b3ea 1500 push {r0, r1, r4, r11, lr}
5813e05d
MR
1501 add r1, r1, r2
1502 sub r1, r1, #1
04e7f6d2 1503 b \type\()_h264_qpel16_mc11
5813e05d 1504 .endfunc
04e7f6d2
MR
1505 .endm
1506
@ Instantiate both write modes of the 16x16 qpel functions.
1507 h264_qpel16 put
1508 h264_qpel16 avg
5a29589b
MR
1509
1510@ Biweighted prediction
1511
@ Inner loop of 16-wide biweighted prediction; expanded inside
@ biweight_func with \macs/\macd = vmlal.u8 or vmlsl.u8 depending on the
@ weight signs. Caller setup: r0/r1 = the two prediction blocks, r2 =
@ stride, r6 = dst (aliases r0), ip = row count, r4/r5 = |weights|,
@ q8 = ((offset+1)|1) << log2_denom, q9 = -(log2_denom+1) so the
@ vshl.s16 performs the final arithmetic right shift.
1512 .macro biweight_16 macs, macd
1513 vdup.8 d0, r4 @ d0 = weight for block 0
1514 vdup.8 d1, r5 @ d1 = weight for block 1
1515 vmov q2, q8 @ preload accumulators with offset/round term
1516 vmov q3, q8
15171: subs ip, ip, #2 @ two rows per iteration
1518 vld1.8 {d20-d21},[r0,:128], r2
1519 \macd q2, d0, d20
1520 pld [r0]
1521 \macd q3, d0, d21
1522 vld1.8 {d22-d23},[r1,:128], r2
1523 \macs q2, d1, d22
1524 pld [r1]
1525 \macs q3, d1, d23
1526 vmov q12, q8 @ second row's accumulators
1527 vld1.8 {d28-d29},[r0,:128], r2
1528 vmov q13, q8
1529 \macd q12, d0, d28
1530 pld [r0]
1531 \macd q13, d0, d29
1532 vld1.8 {d30-d31},[r1,:128], r2
1533 \macs q12, d1, d30
1534 pld [r1]
1535 \macs q13, d1, d31
1536 vshl.s16 q2, q2, q9 @ >> (log2_denom+1)
1537 vshl.s16 q3, q3, q9
1538 vqmovun.s16 d4, q2 @ saturate back to u8
1539 vqmovun.s16 d5, q3
1540 vshl.s16 q12, q12, q9
1541 vshl.s16 q13, q13, q9
1542 vqmovun.s16 d24, q12
1543 vqmovun.s16 d25, q13
1544 vmov q3, q8 @ refill accumulators for next iteration
1545 vst1.8 {d4- d5}, [r6,:128], r2
1546 vmov q2, q8
1547 vst1.8 {d24-d25},[r6,:128], r2
1548 bne 1b
1549 pop {r4-r6, pc}
1550 .endm
1551
@ 8-wide biweighted-prediction inner loop; same register contract as
@ biweight_16 (r0/r1 = predictions, r6 = dst, r2 = stride, ip = rows,
@ q8 = offset/round term, q9 = negative shift count).
1552 .macro biweight_8 macs, macd
1553 vdup.8 d0, r4
1554 vdup.8 d1, r5
1555 vmov q1, q8 @ preload accumulators with offset/round term
1556 vmov q10, q8
15571: subs ip, ip, #2 @ two rows per iteration
1558 vld1.8 {d4},[r0,:64], r2
1559 \macd q1, d0, d4
1560 pld [r0]
1561 vld1.8 {d5},[r1,:64], r2
1562 \macs q1, d1, d5
1563 pld [r1]
1564 vld1.8 {d6},[r0,:64], r2
1565 \macd q10, d0, d6
1566 pld [r0]
1567 vld1.8 {d7},[r1,:64], r2
1568 \macs q10, d1, d7
1569 pld [r1]
1570 vshl.s16 q1, q1, q9 @ >> (log2_denom+1)
1571 vqmovun.s16 d2, q1 @ saturate to u8
1572 vshl.s16 q10, q10, q9
1573 vqmovun.s16 d4, q10
1574 vmov q10, q8 @ refill for next iteration
1575 vst1.8 {d2},[r6,:64], r2
1576 vmov q1, q8
1577 vst1.8 {d4},[r6,:64], r2
1578 bne 1b
1579 pop {r4-r6, pc}
1580 .endm
1581
@ 4-wide biweighted-prediction inner loop; four rows per iteration with a
@ two-row tail (label 2:) for the 4x2 block size. Same register contract
@ as biweight_16/biweight_8.
1582 .macro biweight_4 macs, macd
1583 vdup.8 d0, r4
1584 vdup.8 d1, r5
1585 vmov q1, q8 @ preload accumulators with offset/round term
1586 vmov q10, q8
15871: subs ip, ip, #4 @ four rows per iteration; negative => 2-row tail
1588 vld1.32 {d4[0]},[r0,:32], r2
1589 vld1.32 {d4[1]},[r0,:32], r2
1590 \macd q1, d0, d4
1591 pld [r0]
1592 vld1.32 {d5[0]},[r1,:32], r2
1593 vld1.32 {d5[1]},[r1,:32], r2
1594 \macs q1, d1, d5
1595 pld [r1]
1596 blt 2f @ only 2 rows left (4x2 blocks)
1597 vld1.32 {d6[0]},[r0,:32], r2
1598 vld1.32 {d6[1]},[r0,:32], r2
1599 \macd q10, d0, d6
1600 pld [r0]
1601 vld1.32 {d7[0]},[r1,:32], r2
1602 vld1.32 {d7[1]},[r1,:32], r2
1603 \macs q10, d1, d7
1604 pld [r1]
1605 vshl.s16 q1, q1, q9 @ >> (log2_denom+1)
1606 vqmovun.s16 d2, q1 @ saturate to u8
1607 vshl.s16 q10, q10, q9
1608 vqmovun.s16 d4, q10
1609 vmov q10, q8 @ refill for next iteration
1610 vst1.32 {d2[0]},[r6,:32], r2
1611 vst1.32 {d2[1]},[r6,:32], r2
1612 vmov q1, q8
1613 vst1.32 {d4[0]},[r6,:32], r2
1614 vst1.32 {d4[1]},[r6,:32], r2
1615 bne 1b
1616 pop {r4-r6, pc}
16172: vshl.s16 q1, q1, q9 @ tail: final two rows
1618 vqmovun.s16 d2, q1
1619 vst1.32 {d2[0]},[r6,:32], r2
1620 vst1.32 {d2[1]},[r6,:32], r2
1621 pop {r4-r6, pc}
1622 .endm
1623
@ Shared biweighted-prediction driver for width \w. r3 = log2_denom;
@ three further args (two weights and the offset — per the weighted
@ prediction API; confirm exact order against the C prototype) come from
@ the stack. NEON u8 multiply-accumulates only take unsigned
@ coefficients, so the code dispatches on the weight sign bits to one of
@ four vmlal/vmlsl combinations, negating the weights as needed.
1624 .macro biweight_func w
1625function biweight_h264_pixels_\w\()_neon
1626 push {r4-r6, lr}
1627 add r4, sp, #16 @ stack args live above the 4 saved regs
1628 ldm r4, {r4-r6} @ r4, r5 = weights, r6 = offset
1629 lsr lr, r4, #31 @ sign bit of first weight
1630 add r6, r6, #1
1631 eors lr, lr, r5, lsr #30 @ fold in second weight's sign; Z if both >= 0
1632 orr r6, r6, #1 @ r6 = (offset+1)|1: offset plus rounding bit
1633 vdup.16 q9, r3
1634 lsl r6, r6, r3
1635 vmvn q9, q9 @ q9 = ~log2_denom = -(log2_denom+1)
1636 vdup.16 q8, r6 @ q8 = ((offset+1)|1) << log2_denom
1637 mov r6, r0 @ dst pointer; dst aliases the first prediction
1638 beq 10f @ both weights non-negative
1639 subs lr, lr, #1
1640 beq 20f
1641 subs lr, lr, #1
1642 beq 30f
1643 b 40f
@ Four sign variants: negate negative weights and flip mlal <-> mlsl.
164410: biweight_\w vmlal.u8, vmlal.u8
164520: rsb r4, r4, #0
1646 biweight_\w vmlal.u8, vmlsl.u8
164730: rsb r4, r4, #0
1648 rsb r5, r5, #0
1649 biweight_\w vmlsl.u8, vmlsl.u8
165040: rsb r5, r5, #0
1651 biweight_\w vmlsl.u8, vmlal.u8
1652 .endfunc
1653 .endm
1654
@ Exported WxH biweight entry point: set ip = block height and jump to
@ the shared width-\w handler. With b=0 the branch is omitted because the
@ handler is emitted immediately after this entry and falls through.
1655 .macro biweight_entry w, h, b=1
1656function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
1657 mov ip, #\h
1658.if \b
1659 b biweight_h264_pixels_\w\()_neon
1660.endif
1661 .endfunc
1662 .endm
1663
@ Instantiate the exported WxH biweight entry points; the b=0 entry in
@ each group must be last so it falls through into biweight_func's body.
1664 biweight_entry 16, 8
1665 biweight_entry 16, 16, b=0
1666 biweight_func 16
1667
1668 biweight_entry 8, 16
1669 biweight_entry 8, 4
1670 biweight_entry 8, 8, b=0
1671 biweight_func 8
1672
1673 biweight_entry 4, 8
1674 biweight_entry 4, 2
1675 biweight_entry 4, 4, b=0
1676 biweight_func 4
bd53b426
MR
1677
1678@ Weighted prediction
1679
fe7f149e 1680 .macro weight_16 add
bd53b426 1681 vdup.8 d0, r3
bd53b426
MR
16821: subs ip, ip, #2
1683 vld1.8 {d20-d21},[r0,:128], r1
fe7f149e 1684 vmull.u8 q2, d0, d20
bd53b426 1685 pld [r0]
fe7f149e 1686 vmull.u8 q3, d0, d21
bd53b426 1687 vld1.8 {d28-d29},[r0,:128], r1
fe7f149e 1688 vmull.u8 q12, d0, d28
bd53b426 1689 pld [r0]
fe7f149e
MR
1690 vmull.u8 q13, d0, d29
1691 \add q2, q8, q2
1692 vrshl.s16 q2, q2, q9
1693 \add q3, q8, q3
1694 vrshl.s16 q3, q3, q9
bd53b426
MR
1695 vqmovun.s16 d4, q2
1696 vqmovun.s16 d5, q3
fe7f149e
MR
1697 \add q12, q8, q12
1698 vrshl.s16 q12, q12, q9
1699 \add q13, q8, q13
1700 vrshl.s16 q13, q13, q9
bd53b426
MR
1701 vqmovun.s16 d24, q12
1702 vqmovun.s16 d25, q13
bd53b426 1703 vst1.8 {d4- d5}, [r4,:128], r1
bd53b426
MR
1704 vst1.8 {d24-d25},[r4,:128], r1
1705 bne 1b
1706 pop {r4, pc}
1707 .endm
1708
fe7f149e 1709 .macro weight_8 add
bd53b426 1710 vdup.8 d0, r3
bd53b426
MR
17111: subs ip, ip, #2
1712 vld1.8 {d4},[r0,:64], r1
fe7f149e 1713 vmull.u8 q1, d0, d4
bd53b426
MR
1714 pld [r0]
1715 vld1.8 {d6},[r0,:64], r1
fe7f149e
MR
1716 vmull.u8 q10, d0, d6
1717 \add q1, q8, q1
bd53b426 1718 pld [r0]
fe7f149e 1719 vrshl.s16 q1, q1, q9
bd53b426 1720 vqmovun.s16 d2, q1
fe7f149e
MR
1721 \add q10, q8, q10
1722 vrshl.s16 q10, q10, q9
bd53b426 1723 vqmovun.s16 d4, q10
bd53b426 1724 vst1.8 {d2},[r4,:64], r1
bd53b426
MR
1725 vst1.8 {d4},[r4,:64], r1
1726 bne 1b
1727 pop {r4, pc}
1728 .endm
1729
fe7f149e 1730 .macro weight_4 add
bd53b426
MR
1731 vdup.8 d0, r3
1732 vmov q1, q8
1733 vmov q10, q8
17341: subs ip, ip, #4
1735 vld1.32 {d4[0]},[r0,:32], r1
1736 vld1.32 {d4[1]},[r0,:32], r1
fe7f149e 1737 vmull.u8 q1, d0, d4
bd53b426
MR
1738 pld [r0]
1739 blt 2f
1740 vld1.32 {d6[0]},[r0,:32], r1
1741 vld1.32 {d6[1]},[r0,:32], r1
fe7f149e 1742 vmull.u8 q10, d0, d6
bd53b426 1743 pld [r0]
fe7f149e
MR
1744 \add q1, q8, q1
1745 vrshl.s16 q1, q1, q9
bd53b426 1746 vqmovun.s16 d2, q1
fe7f149e
MR
1747 \add q10, q8, q10
1748 vrshl.s16 q10, q10, q9
bd53b426
MR
1749 vqmovun.s16 d4, q10
1750 vmov q10, q8
1751 vst1.32 {d2[0]},[r4,:32], r1
1752 vst1.32 {d2[1]},[r4,:32], r1
1753 vmov q1, q8
1754 vst1.32 {d4[0]},[r4,:32], r1
1755 vst1.32 {d4[1]},[r4,:32], r1
1756 bne 1b
1757 pop {r4, pc}
fe7f149e
MR
17582: \add q1, q8, q1
1759 vrshl.s16 q1, q1, q9
bd53b426
MR
1760 vqmovun.s16 d2, q1
1761 vst1.32 {d2[0]},[r4,:32], r1
1762 vst1.32 {d2[1]},[r4,:32], r1
1763 pop {r4, pc}
1764 .endm
1765
@ Shared weighted-prediction driver for width \w. r2 = log2_denom,
@ r3 = weight, offset comes from the stack. Dispatches on log2_denom and
@ on the weight sign: for log2_denom > 1 the halving vhadd/vhsub
@ variants are used with shift (1 - log2_denom) — presumably to keep the
@ 16-bit intermediate from overflowing for large denominators (confirm
@ against the C reference); negative weights are negated and the add
@ becomes a subtract.
1766 .macro weight_func w
1767function weight_h264_pixels_\w\()_neon
1768 push {r4, lr}
1769 ldr r4, [sp, #8] @ offset (stack arg above the 2 saved regs)
fe7f149e 1770 cmp r2, #1
bd53b426 1771 lsl r4, r4, r2 @ r4 = offset << log2_denom
bd53b426
MR
1772 vdup.16 q8, r4
1773 mov r4, r0 @ dst pointer (in-place over src)
fe7f149e
MR
1774 ble 20f @ log2_denom <= 1: plain add path
1775 rsb lr, r2, #1 @ shift = 1 - log2_denom (halving add does the other 1)
1776 vdup.16 q9, lr
1777 cmp r3, #0
1778 blt 10f
1779 weight_\w vhadd.s16
178010: rsb r3, r3, #0 @ negate weight, subtract instead of add
1781 weight_\w vhsub.s16
178220: rsb lr, r2, #0 @ shift = -log2_denom
1783 vdup.16 q9, lr
1784 cmp r3, #0
bd53b426 1785 blt 10f
fe7f149e 1786 weight_\w vadd.s16
bd53b426 178710: rsb r3, r3, #0 @ negate weight, subtract instead of add
fe7f149e 1788 weight_\w vsub.s16
bd53b426
MR
1789 .endfunc
1790 .endm
1791
@ Exported WxH weight entry point: set ip = block height and jump to the
@ shared width-\w handler. With b=0 the branch is omitted because the
@ handler is emitted immediately after this entry and falls through.
1792 .macro weight_entry w, h, b=1
1793function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
1794 mov ip, #\h
1795.if \b
1796 b weight_h264_pixels_\w\()_neon
1797.endif
1798 .endfunc
1799 .endm
1800
@ Instantiate the exported WxH weight entry points; the b=0 entry in
@ each group must be last so it falls through into weight_func's body.
1801 weight_entry 16, 8
1802 weight_entry 16, 16, b=0
1803 weight_func 16
1804
1805 weight_entry 8, 16
1806 weight_entry 8, 4
1807 weight_entry 8, 8, b=0
1808 weight_func 8
1809
1810 weight_entry 4, 8
1811 weight_entry 4, 2
1812 weight_entry 4, 4, b=0
1813 weight_func 4