@ libavcodec/arm/vp9mc_neon.S
/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

const regular_filter, align=4
        .short          0, 1, -5, 126, 8, -3, 1, 0
        .short          -1, 3, -10, 122, 18, -6, 2, 0
        .short          -1, 4, -13, 118, 27, -9, 3, -1
        .short          -1, 4, -16, 112, 37, -11, 4, -1
        .short          -1, 5, -18, 105, 48, -14, 4, -1
        .short          -1, 5, -19, 97, 58, -16, 5, -1
        .short          -1, 6, -19, 88, 68, -18, 5, -1
        .short          -1, 6, -19, 78, 78, -19, 6, -1
        .short          -1, 5, -18, 68, 88, -19, 6, -1
        .short          -1, 5, -16, 58, 97, -19, 5, -1
        .short          -1, 4, -14, 48, 105, -18, 5, -1
        .short          -1, 4, -11, 37, 112, -16, 4, -1
        .short          -1, 3, -9, 27, 118, -13, 4, -1
        .short          0, 2, -6, 18, 122, -10, 3, -1
        .short          0, 1, -3, 8, 126, -5, 1, 0
endconst

const sharp_filter, align=4
        .short          -1, 3, -7, 127, 8, -3, 1, 0
        .short          -2, 5, -13, 125, 17, -6, 3, -1
        .short          -3, 7, -17, 121, 27, -10, 5, -2
        .short          -4, 9, -20, 115, 37, -13, 6, -2
        .short          -4, 10, -23, 108, 48, -16, 8, -3
        .short          -4, 10, -24, 100, 59, -19, 9, -3
        .short          -4, 11, -24, 90, 70, -21, 10, -4
        .short          -4, 11, -23, 80, 80, -23, 11, -4
        .short          -4, 10, -21, 70, 90, -24, 11, -4
        .short          -3, 9, -19, 59, 100, -24, 10, -4
        .short          -3, 8, -16, 48, 108, -23, 10, -4
        .short          -2, 6, -13, 37, 115, -20, 9, -4
        .short          -2, 5, -10, 27, 121, -17, 7, -3
        .short          -1, 3, -6, 17, 125, -13, 5, -2
        .short          0, 1, -3, 8, 127, -7, 3, -1
endconst

const smooth_filter, align=4
        .short          -3, -1, 32, 64, 38, 1, -3, 0
        .short          -2, -2, 29, 63, 41, 2, -3, 0
        .short          -2, -2, 26, 63, 43, 4, -4, 0
        .short          -2, -3, 24, 62, 46, 5, -4, 0
        .short          -2, -3, 21, 60, 49, 7, -4, 0
        .short          -1, -4, 18, 59, 51, 9, -4, 0
        .short          -1, -4, 16, 57, 53, 12, -4, -1
        .short          -1, -4, 14, 55, 55, 14, -4, -1
        .short          -1, -4, 12, 53, 57, 16, -4, -1
        .short          0, -4, 9, 51, 59, 18, -4, -1
        .short          0, -4, 7, 49, 60, 21, -3, -2
        .short          0, -4, 5, 46, 62, 24, -3, -2
        .short          0, -4, 4, 43, 63, 26, -2, -2
        .short          0, -3, 2, 41, 63, 29, -2, -2
        .short          0, -3, 1, 38, 64, 32, -1, -3
endconst
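
@ The three tables above are the VP9 8-tap subpel filter banks (regular,
@ sharp and smooth). Each has 15 rows, one per fractional position 1-15,
@ and every row of 8 coefficients sums to 128, which is why the filtered
@ sums below are renormalized with a rounding right shift by 7
@ (vqrshrun.s16 ..., #7). The wrapper functions load the row for a given
@ mx/my as filter_table - 16 + 16*mx, i.e. row mx - 1; fractional
@ position 0 (no filtering) is handled by the plain copy/avg functions
@ below instead.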

@ All public functions in this file have the following signature:
@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
@                             const uint8_t *ref, ptrdiff_t ref_stride,
@                             int h, int mx, int my);
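@
@ As a rough C reference for the copy/avg entry points below (an
@ illustrative sketch only, not code from this project):
@
@     for (int y = 0; y < h; y++, dst += dst_stride, ref += ref_stride)
@         for (int x = 0; x < SIZE; x++)
@             dst[x] = avg ? (dst[x] + ref[x] + 1) >> 1 : ref[x];
@
@ where SIZE is the 64/32/16/8/4 in the function name and the rounding
@ average is what vrhadd.u8 computes per byte.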

function ff_vp9_copy64_neon, export=1
        ldr             r12, [sp]
        sub             r1, r1, #32
        sub             r3, r3, #32
1:
        vld1.8          {q0, q1}, [r2]!
        vst1.8          {q0, q1}, [r0, :128]!
        vld1.8          {q2, q3}, [r2], r3
        subs            r12, r12, #1
        vst1.8          {q2, q3}, [r0, :128], r1
        bne             1b
        bx              lr
endfunc
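@ The strides in r1/r3 are reduced by 32 in the 64-pixel copy/avg loops
@ because the first of the two 32-byte transfers per row post-increments
@ its pointer; adding the reduced stride after the second transfer then
@ advances to the next row.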

function ff_vp9_avg64_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        sub             r1, r1, #32
        sub             r3, r3, #32
        mov             lr, r0
1:
        vld1.8          {q8, q9}, [r2]!
        vld1.8          {q0, q1}, [r0, :128]!
        vld1.8          {q10, q11}, [r2], r3
        vrhadd.u8       q0, q0, q8
        vld1.8          {q2, q3}, [r0, :128], r1
        vrhadd.u8       q1, q1, q9
        vrhadd.u8       q2, q2, q10
        vst1.8          {q0, q1}, [lr, :128]!
        vrhadd.u8       q3, q3, q11
        vst1.8          {q2, q3}, [lr, :128], r1
        subs            r12, r12, #1
        bne             1b
        pop             {pc}
endfunc

function ff_vp9_copy32_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {q0, q1}, [r2], r3
        subs            r12, r12, #1
        vst1.8          {q0, q1}, [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg32_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {q2, q3}, [r2], r3
        vld1.8          {q0, q1}, [r0, :128]
        vrhadd.u8       q0, q0, q2
        vrhadd.u8       q1, q1, q3
        subs            r12, r12, #1
        vst1.8          {q0, q1}, [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_copy16_neon, export=1
        push            {r4,lr}
        ldr             r12, [sp, #8]
        add             r4, r0, r1
        add             lr, r2, r3
        add             r1, r1, r1
        add             r3, r3, r3
1:
        vld1.8          {q0}, [r2], r3
        vld1.8          {q1}, [lr], r3
        subs            r12, r12, #2
        vst1.8          {q0}, [r0, :128], r1
        vst1.8          {q1}, [r4, :128], r1
        bne             1b
        pop             {r4,pc}
endfunc

function ff_vp9_avg16_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {q2}, [r2], r3
        vld1.8          {q0}, [r0, :128], r1
        vld1.8          {q3}, [r2], r3
        vrhadd.u8       q0, q0, q2
        vld1.8          {q1}, [r0, :128]
        sub             r0, r0, r1
        vrhadd.u8       q1, q1, q3
        subs            r12, r12, #2
        vst1.8          {q0}, [r0, :128], r1
        vst1.8          {q1}, [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_copy8_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {d0}, [r2], r3
        vld1.8          {d1}, [r2], r3
        subs            r12, r12, #2
        vst1.8          {d0}, [r0, :64], r1
        vst1.8          {d1}, [r0, :64], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg8_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {d2}, [r2], r3
        vld1.8          {d0}, [r0, :64], r1
        vld1.8          {d3}, [r2], r3
        vrhadd.u8       d0, d0, d2
        vld1.8          {d1}, [r0, :64]
        sub             r0, r0, r1
        vrhadd.u8       d1, d1, d3
        subs            r12, r12, #2
        vst1.8          {d0}, [r0, :64], r1
        vst1.8          {d1}, [r0, :64], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_copy4_neon, export=1
        ldr             r12, [sp]
1:
        vld1.32         {d0[]}, [r2], r3
        vld1.32         {d1[]}, [r2], r3
        vst1.32         {d0[0]}, [r0, :32], r1
        vld1.32         {d2[]}, [r2], r3
        vst1.32         {d1[0]}, [r0, :32], r1
        vld1.32         {d3[]}, [r2], r3
        subs            r12, r12, #4
        vst1.32         {d2[0]}, [r0, :32], r1
        vst1.32         {d3[0]}, [r0, :32], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg4_neon, export=1
        ldr             r12, [sp]
1:
        vld1.32         {d4[]}, [r2], r3
        vld1.32         {d0[]}, [r0, :32], r1
        vld1.32         {d5[]}, [r2], r3
        vrhadd.u8       d0, d0, d4
        vld1.32         {d1[]}, [r0, :32], r1
        vld1.32         {d6[]}, [r2], r3
        vrhadd.u8       d1, d1, d5
        vld1.32         {d2[]}, [r0, :32], r1
        vld1.32         {d7[]}, [r2], r3
        vrhadd.u8       d2, d2, d6
        vld1.32         {d3[]}, [r0, :32], r1
        sub             r0, r0, r1, lsl #2
        subs            r12, r12, #4
        vst1.32         {d0[0]}, [r0, :32], r1
        vrhadd.u8       d3, d3, d7
        vst1.32         {d1[0]}, [r0, :32], r1
        vst1.32         {d2[0]}, [r0, :32], r1
        vst1.32         {d3[0]}, [r0, :32], r1
        bne             1b
        bx              lr
endfunc

@ Helper macros for vmul/vmla with a constant from either d0 or d1 depending on index
.macro vmul_lane dst, src, idx
.if \idx < 4
        vmul.s16        \dst, \src, d0[\idx]
.else
        vmul.s16        \dst, \src, d1[\idx - 4]
.endif
.endm
.macro vmla_lane dst, src, idx
.if \idx < 4
        vmla.s16        \dst, \src, d0[\idx]
.else
        vmla.s16        \dst, \src, d1[\idx - 4]
.endif
.endm
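@ For example, "vmla_lane q1, q8, 5" expands to "vmla.s16 q1, q8, d1[1]":
@ after the vld1.16 {q0} of a coefficient row, the 8 filter taps live in
@ lanes d0[0]-d0[3] and d1[0]-d1[3], and only a d-register lane can be
@ used as the scalar operand of vmul/vmla.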

@ Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
@ for size >= 16), and multiply-accumulate into dst1 and dst3 (or
@ dst1-dst2 and dst3-dst4 for size >= 16)
.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src4, \src5, #(2*\offset)
.if \size >= 16
        vmla_lane       \dst1, q14, \offset
        vext.8          q5, \src2, \src3, #(2*\offset)
        vmla_lane       \dst3, q15, \offset
        vext.8          q6, \src5, \src6, #(2*\offset)
        vmla_lane       \dst2, q5, \offset
        vmla_lane       \dst4, q6, \offset
.else
        vmla_lane       \dst1, q14, \offset
        vmla_lane       \dst3, q15, \offset
.endif
.endm
@ The same as above, but instead of accumulating straight into the
@ destination, multiply into a temporary register and accumulate into
@ the destination with saturation.
.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src4, \src5, #(2*\offset)
.if \size >= 16
        vmul_lane       q14, q14, \offset
        vext.8          q5, \src2, \src3, #(2*\offset)
        vmul_lane       q15, q15, \offset
        vext.8          q6, \src5, \src6, #(2*\offset)
        vmul_lane       q5, q5, \offset
        vmul_lane       q6, q6, \offset
.else
        vmul_lane       q14, q14, \offset
        vmul_lane       q15, q15, \offset
.endif
        vqadd.s16       \dst1, \dst1, q14
        vqadd.s16       \dst3, \dst3, q15
.if \size >= 16
        vqadd.s16       \dst2, \dst2, q5
        vqadd.s16       \dst4, \dst4, q6
.endif
.endm
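
@ In scalar terms (an illustrative reading, not generated code): one
@ extmla/extmulqadd invocation with a given offset adds the single tap
@ filter[offset] * src[x + offset] to every output accumulator x of the
@ row. The vext.8 by 2*offset bytes shifts the 16-bit widened source
@ lanes by offset pixels, so a plain per-lane multiply-accumulate against
@ d0[]/d1[] applies that tap to all lanes at once.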


@ Instantiate a horizontal filter function for the given size.
@ This can work on 4, 8 or 16 pixels in parallel; for larger
@ widths it will do 16 pixels at a time and loop horizontally.
@ The actual width is passed in r5, the height in r4 and
@ the filter coefficients in r12. idx2 is the index of the largest
@ filter coefficient (3 or 4) and idx1 is the other one.
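@
@ A rough scalar equivalent of one output pixel (an illustrative sketch;
@ sat16() and clip_uint8() are stand-in helpers, not functions from this
@ codebase):
@
@     int sum = 0;                                         // must stay within int16
@     for (int k = 0; k < 8; k++)
@         if (k != idx2)
@             sum += filter[k] * src[x + k - 3];           // the vmla.s16 chain
@     sum = sat16(sum + filter[idx2] * src[x + idx2 - 3]); // extmulqadd: vqadd.s16
@     dst[x] = clip_uint8((sum + 64) >> 7);                // vqrshrun.s16 #7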
.macro do_8tap_h type, size, idx1, idx2
function \type\()_8tap_\size\()h_\idx1\idx2
        sub             r2, r2, #3
        add             r6, r0, r1
        add             r7, r2, r3
        add             r1, r1, r1
        add             r3, r3, r3
        @ Only size >= 16 loops horizontally and needs
        @ a reduced dst stride
.if \size >= 16
        sub             r1, r1, r5
.endif
        @ size >= 16 loads two qwords and increments r2;
        @ for size 4/8 one qword with no postincrement is enough
.if \size >= 16
        sub             r3, r3, r5
        sub             r3, r3, #8
.endif
        @ Load the filter vector
        vld1.16         {q0}, [r12,:128]
1:
.if \size >= 16
        mov             r12, r5
.endif
        @ Load src
.if \size >= 16
        vld1.8          {d18, d19, d20}, [r2]!
        vld1.8          {d24, d25, d26}, [r7]!
.else
        vld1.8          {q9}, [r2]
        vld1.8          {q12}, [r7]
.endif
        vmovl.u8        q8, d18
        vmovl.u8        q9, d19
        vmovl.u8        q11, d24
        vmovl.u8        q12, d25
.if \size >= 16
        vmovl.u8        q10, d20
        vmovl.u8        q13, d26
.endif
2:

        @ Accumulate, adding idx2 last with a separate
        @ saturating add. The positive filter coefficients
        @ for all indices except idx2 must add up to less
        @ than 127 for this not to overflow.
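        @ For example, with the sharp filter's middle row
        @ -4, 11, -23, 80, 80, -23, 11, -4 and idx2 == 4, the positive
        @ taps other than index 4 sum to 11 + 80 + 11 = 102, so the
        @ worst-case partial sum 102 * 255 = 26010 still fits in int16.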
        vmul.s16        q1, q8, d0[0]
        vmul.s16        q3, q11, d0[0]
.if \size >= 16
        vmul.s16        q2, q9, d0[0]
        vmul.s16        q4, q12, d0[0]
.endif
        extmla          q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 1, \size
        extmla          q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 2, \size
        extmla          q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, \idx1, \size
        extmla          q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 5, \size
        extmla          q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 6, \size
        extmla          q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 7, \size
        extmulqadd      q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, \idx2, \size

        @ Round, shift and saturate
        vqrshrun.s16    d2, q1, #7
        vqrshrun.s16    d6, q3, #7
.if \size >= 16
        vqrshrun.s16    d3, q2, #7
        vqrshrun.s16    d7, q4, #7
.endif
        @ Average
.ifc \type,avg
.if \size >= 16
        vld1.8          {q14}, [r0,:128]
        vld1.8          {q15}, [r6,:128]
        vrhadd.u8       q1, q1, q14
        vrhadd.u8       q3, q3, q15
.elseif \size == 8
        vld1.8          {d28}, [r0,:64]
        vld1.8          {d30}, [r6,:64]
        vrhadd.u8       d2, d2, d28
        vrhadd.u8       d6, d6, d30
.else
        @ We only need d28[0], but [] is faster on some cores
        vld1.32         {d28[]}, [r0,:32]
        vld1.32         {d30[]}, [r6,:32]
        vrhadd.u8       d2, d2, d28
        vrhadd.u8       d6, d6, d30
.endif
.endif
        @ Store and loop horizontally (for size >= 16)
.if \size >= 16
        vst1.8          {q1}, [r0,:128]!
        vst1.8          {q3}, [r6,:128]!
        vmov            q8, q10
        vmov            q11, q13
        subs            r12, r12, #16
        beq             3f
        vld1.8          {q10}, [r2]!
        vld1.8          {q13}, [r7]!
        vmovl.u8        q9, d20
        vmovl.u8        q10, d21
        vmovl.u8        q12, d26
        vmovl.u8        q13, d27
        b               2b
.elseif \size == 8
        vst1.8          {d2}, [r0,:64]
        vst1.8          {d6}, [r6,:64]
.else @ \size == 4
        vst1.32         {d2[0]}, [r0,:32]
        vst1.32         {d6[0]}, [r6,:32]
.endif
3:
        @ Loop vertically
        add             r0, r0, r1
        add             r6, r6, r1
        add             r2, r2, r3
        add             r7, r7, r3
        subs            r4, r4, #2
        bne             1b
.if \size >= 16
        vpop            {q4-q6}
.endif
        pop             {r4-r7}
        bx              lr
endfunc
.endm

.macro do_8tap_h_size size
do_8tap_h put, \size, 3, 4
do_8tap_h avg, \size, 3, 4
do_8tap_h put, \size, 4, 3
do_8tap_h avg, \size, 4, 3
.endm

do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16
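
@ Each size therefore gets four internal cores: put/avg crossed with the
@ two tap orderings (_34 when the largest tap is at index 4, _43 when it
@ is at index 3). The 32 and 64 pixel public functions below reuse the
@ 16 pixel core, which loops horizontally across the row.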

.macro do_8tap_h_func type, filter, size
function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
        push            {r4-r7}
.if \size >= 16
        vpush           {q4-q6}
        ldr             r4, [sp, #64]
        ldr             r5, [sp, #68]
.else
        ldr             r4, [sp, #16]
        ldr             r5, [sp, #20]
.endif
        movrel          r12, \filter\()_filter-16
        cmp             r5, #8
        add             r12, r12, r5, lsl #4
        mov             r5, #\size
.if \size >= 16
        bge             \type\()_8tap_16h_34
        b               \type\()_8tap_16h_43
.else
        bge             \type\()_8tap_\size\()h_34
        b               \type\()_8tap_\size\()h_43
.endif
endfunc
.endm
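
@ The two ldr offsets above pick up the stack arguments h and mx: 16 bytes
@ of push {r4-r7}, plus 48 bytes of vpush {q4-q6} for size >= 16. mx
@ selects the coefficient row as filter - 16 + 16*mx, and the cmp/bge
@ chooses the _34 core (largest tap at index 4) for mx >= 8 and the _43
@ core (largest tap at index 3) for mx < 8; r5 is then reloaded with the
@ block width that the shared cores expect.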

.macro do_8tap_h_filters size
do_8tap_h_func put, regular, \size
do_8tap_h_func avg, regular, \size
do_8tap_h_func put, sharp, \size
do_8tap_h_func avg, sharp, \size
do_8tap_h_func put, smooth, \size
do_8tap_h_func avg, smooth, \size
.endm

do_8tap_h_filters 64
do_8tap_h_filters 32
do_8tap_h_filters 16
do_8tap_h_filters 8
do_8tap_h_filters 4


@ Vertical filters

@ Round, shift and saturate and store qreg1-2 over 4 lines
.macro do_store4 qreg1, dreg1, qreg2, dreg2, tmp1, tmp2, type
        vqrshrun.s16    \dreg1, \qreg1, #7
        vqrshrun.s16    \dreg2, \qreg2, #7
.ifc \type,avg
        vld1.32         {\tmp1[]}, [r0,:32], r1
        vld1.32         {\tmp2[]}, [r0,:32], r1
        vld1.32         {\tmp1[1]}, [r0,:32], r1
        vld1.32         {\tmp2[1]}, [r0,:32], r1
        vrhadd.u8       \dreg1, \dreg1, \tmp1
        vrhadd.u8       \dreg2, \dreg2, \tmp2
        sub             r0, r0, r1, lsl #2
.endif
        vst1.32         {\dreg1[0]}, [r0,:32], r1
        vst1.32         {\dreg2[0]}, [r0,:32], r1
        vst1.32         {\dreg1[1]}, [r0,:32], r1
        vst1.32         {\dreg2[1]}, [r0,:32], r1
.endm
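
@ Note the interleaved order above: dreg1 holds output rows 0 and 2 and
@ dreg2 rows 1 and 3, matching the two-rows-per-register layout produced
@ by the 4-pixel vertical core further down, so the avg path loads the
@ existing destination pixels in the same 0/2 and 1/3 pattern.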

@ Round, shift and saturate and store qreg1-4
.macro do_store qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, type
        vqrshrun.s16    \dreg1, \qreg1, #7
        vqrshrun.s16    \dreg2, \qreg2, #7
        vqrshrun.s16    \dreg3, \qreg3, #7
        vqrshrun.s16    \dreg4, \qreg4, #7
.ifc \type,avg
        vld1.8          {\tmp1}, [r0,:64], r1
        vld1.8          {\tmp2}, [r0,:64], r1
        vld1.8          {\tmp3}, [r0,:64], r1
        vld1.8          {\tmp4}, [r0,:64], r1
        vrhadd.u8       \dreg1, \dreg1, \tmp1
        vrhadd.u8       \dreg2, \dreg2, \tmp2
        vrhadd.u8       \dreg3, \dreg3, \tmp3
        vrhadd.u8       \dreg4, \dreg4, \tmp4
        sub             r0, r0, r1, lsl #2
.endif
        vst1.8          {\dreg1}, [r0,:64], r1
        vst1.8          {\dreg2}, [r0,:64], r1
        vst1.8          {\dreg3}, [r0,:64], r1
        vst1.8          {\dreg4}, [r0,:64], r1
.endm

@ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
@ (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
@ at the end with saturation. Indices 0 and 7 always have negative or zero
@ coefficients, so they can be accumulated into tmp1-tmp2 together with the
@ largest coefficient.
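@ This split keeps the intermediate sums within int16 range: the largest
@ tap is at most 127 (so its product is at most 127 * 255 = 32385) and
@ taps 0 and 7 are never positive, so tmp1/tmp2 cannot overflow; the
@ remaining taps accumulate separately, and only the final combination
@ uses a saturating vqadd.s16.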
.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
        vmul.s16        \dst1, \src2, d0[1]
        vmul.s16        \dst2, \src3, d0[1]
        vmul.s16        \tmp1, \src1, d0[0]
        vmul.s16        \tmp2, \src2, d0[0]
        vmla.s16        \dst1, \src3, d0[2]
        vmla.s16        \dst2, \src4, d0[2]
.if \idx1 == 3
        vmla.s16        \dst1, \src4, d0[3]
        vmla.s16        \dst2, \src5, d0[3]
.else
        vmla.s16        \dst1, \src5, d1[0]
        vmla.s16        \dst2, \src6, d1[0]
.endif
        vmla.s16        \dst1, \src6, d1[1]
        vmla.s16        \dst2, \src7, d1[1]
        vmla.s16        \tmp1, \src8, d1[3]
        vmla.s16        \tmp2, \src9, d1[3]
        vmla.s16        \dst1, \src7, d1[2]
        vmla.s16        \dst2, \src8, d1[2]
.if \idx2 == 3
        vmla.s16        \tmp1, \src4, d0[3]
        vmla.s16        \tmp2, \src5, d0[3]
.else
        vmla.s16        \tmp1, \src5, d1[0]
        vmla.s16        \tmp2, \src6, d1[0]
.endif
        vqadd.s16       \dst1, \dst1, \tmp1
        vqadd.s16       \dst2, \dst2, \tmp2
.endm

@ Load pixels and extend them to 16 bit
.macro loadl dst1, dst2, dst3, dst4
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
.ifnb \dst4
        vld1.8          {d5}, [r2], r3
.endif
        vmovl.u8        \dst1, d2
        vmovl.u8        \dst2, d3
        vmovl.u8        \dst3, d4
.ifnb \dst4
        vmovl.u8        \dst4, d5
.endif
.endm

@ Instantiate a vertical filter function for filtering 8 pixels at a time.
@ The height is passed in r4, the width in r5 and the filter coefficients
@ in r12. idx2 is the index of the largest filter coefficient (3 or 4)
@ and idx1 is the other one.
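@
@ The core keeps a sliding window of source rows, widened to 16 bits, in
@ q registers: loadl pulls in 4 new rows per step and each step emits 4
@ output rows. The loop body is unrolled three times so the window can
@ rotate through the register file without extra vmovs. Once a column is
@ finished, the mls/sub/add sequence near the end rewinds dst by h rows
@ and src by the h + 7 rows it consumed, then steps 8 pixels to the right
@ for the next 8-wide column (widths larger than 8).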
.macro do_8tap_8v type, idx1, idx2
function \type\()_8tap_8v_\idx1\idx2
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, r3
        vld1.16         {q0}, [r12, :128]
1:
        mov             r12, r4

        loadl           q5, q6, q7
        loadl           q8, q9, q10, q11
2:
        loadl           q12, q13, q14, q15
        convolve        q1, q2, q5, q6, q7, q8, q9, q10, q11, q12, q13, \idx1, \idx2, q4, q5
        convolve        q3, q4, q7, q8, q9, q10, q11, q12, q13, q14, q15, \idx1, \idx2, q5, q6
        do_store        q1, d2, q2, d4, q3, d6, q4, d8, d3, d5, d7, d9, \type

        subs            r12, r12, #4
        beq             8f

        loadl           q4, q5, q6, q7
        convolve        q1, q2, q9, q10, q11, q12, q13, q14, q15, q4, q5, \idx1, \idx2, q8, q9
        convolve        q3, q8, q11, q12, q13, q14, q15, q4, q5, q6, q7, \idx1, \idx2, q9, q10
        do_store        q1, d2, q2, d4, q3, d6, q8, d16, d3, d5, d7, d17, \type

        subs            r12, r12, #4
        beq             8f

        loadl           q8, q9, q10, q11
        convolve        q1, q2, q13, q14, q15, q4, q5, q6, q7, q8, q9, \idx1, \idx2, q12, q13
        convolve        q3, q12, q15, q4, q5, q6, q7, q8, q9, q10, q11, \idx1, \idx2, q13, q14
        do_store        q1, d2, q2, d4, q3, d6, q12, d24, d3, d5, d7, d25, \type

        subs            r12, r12, #4
        bne             2b

8:
        subs            r5, r5, #8
        beq             9f
        @ r0 -= h * dst_stride
        mls             r0, r1, r4, r0
        @ r2 -= h * src_stride
        mls             r2, r3, r4, r2
        @ r2 -= 8 * src_stride
        sub             r2, r2, r3, lsl #3
        @ r2 += 1 * src_stride
        add             r2, r2, r3
        add             r2, r2, #8
        add             r0, r0, #8
        b               1b
9:
        vpop            {q4-q7}
        pop             {r4-r5}
        bx              lr
endfunc
.endm

do_8tap_8v put, 3, 4
do_8tap_8v put, 4, 3
do_8tap_8v avg, 3, 4
do_8tap_8v avg, 4, 3

@ Instantiate a vertical filter function for filtering a 4 pixel wide
@ slice. The first half of each register contains one row, while the
@ second half contains the second-next row (also stored in the first
@ half of the register two steps ahead). The convolution does two outputs
@ at a time; the output of q5-q12 into one, and q6-q13 into the other.
@ The first half of the first output is the first output row, the first
@ half of the other output is the second output row; the second halves
@ of the registers are rows 3 and 4.
@ This is only designed to work for 4 or 8 output lines.
.macro do_8tap_4v type, idx1, idx2
function \type\()_8tap_4v_\idx1\idx2
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, r3
        vld1.16         {q0}, [r12, :128]

        vld1.32         {d2[]}, [r2], r3
        vld1.32         {d3[]}, [r2], r3
        vld1.32         {d4[]}, [r2], r3
        vld1.32         {d5[]}, [r2], r3
        vld1.32         {d6[]}, [r2], r3
        vld1.32         {d7[]}, [r2], r3
        vext.8          d2, d2, d4, #4
        vld1.32         {d8[]}, [r2], r3
        vext.8          d3, d3, d5, #4
        vld1.32         {d9[]}, [r2], r3
        vmovl.u8        q5, d2
        vext.8          d4, d4, d6, #4
        vld1.32         {d28[]}, [r2], r3
        vmovl.u8        q6, d3
        vext.8          d5, d5, d7, #4
        vld1.32         {d29[]}, [r2], r3
        vmovl.u8        q7, d4
        vext.8          d6, d6, d8, #4
        vld1.32         {d30[]}, [r2], r3
        vmovl.u8        q8, d5
        vext.8          d7, d7, d9, #4
        vmovl.u8        q9, d6
        vext.8          d8, d8, d28, #4
        vmovl.u8        q10, d7
        vext.8          d9, d9, d29, #4
        vmovl.u8        q11, d8
        vext.8          d28, d28, d30, #4
        vmovl.u8        q12, d9
        vmovl.u8        q13, d28

        convolve        q1, q2, q5, q6, q7, q8, q9, q10, q11, q12, q13, \idx1, \idx2, q4, q3
        do_store4       q1, d2, q2, d4, d3, d5, \type
        subs            r4, r4, #4
        beq             9f

        vld1.32         {d2[]}, [r2], r3
        vld1.32         {d3[]}, [r2], r3
        vext.8          d29, d29, d2, #4
        vext.8          d30, d30, d3, #4
        vld1.32         {d2[1]}, [r2], r3
        vmovl.u8        q14, d29
        vld1.32         {d3[1]}, [r2], r3
        vmovl.u8        q15, d30
        vmovl.u8        q5, d2
        vmovl.u8        q6, d3

        convolve        q1, q2, q9, q10, q11, q12, q13, q14, q15, q5, q6, \idx1, \idx2, q4, q3
        do_store4       q1, d2, q2, d4, d3, d5, \type

9:
        vpop            {q4-q7}
        pop             {r4-r5}
        bx              lr
endfunc
.endm

do_8tap_4v put, 3, 4
do_8tap_4v put, 4, 3
do_8tap_4v avg, 3, 4
do_8tap_4v avg, 4, 3

.macro do_8tap_v_func type, filter, size
function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
        push            {r4-r5}
        vpush           {q4-q7}
        ldr             r4, [sp, #72]
        ldr             r5, [sp, #80]
        movrel          r12, \filter\()_filter-16
        add             r12, r12, r5, lsl #4
        cmp             r5, #8
        mov             r5, #\size
.if \size >= 8
        bge             \type\()_8tap_8v_34
        b               \type\()_8tap_8v_43
.else
        bge             \type\()_8tap_4v_34
        b               \type\()_8tap_4v_43
.endif
endfunc
.endm
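
@ Here the stack argument picked up into r5 is my, the vertical fraction:
@ sp + 80 covers the 72 bytes of push {r4-r5} and vpush {q4-q7} plus the
@ 8-byte offset of my past h and mx, while r4 gets h from sp + 72. As in
@ the horizontal wrappers, my selects the coefficient row and the cmp/bge
@ picks the _34 or _43 core depending on where the largest tap sits.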

.macro do_8tap_v_filters size
do_8tap_v_func put, regular, \size
do_8tap_v_func avg, regular, \size
do_8tap_v_func put, sharp, \size
do_8tap_v_func avg, sharp, \size
do_8tap_v_func put, smooth, \size
do_8tap_v_func avg, smooth, \size
.endm

do_8tap_v_filters 64
do_8tap_v_filters 32
do_8tap_v_filters 16
do_8tap_v_filters 8
do_8tap_v_filters 4