arm: vp9mc: Insert a literal pool at the middle of the file
[libav.git] / libavcodec / arm / vp9mc_neon.S
/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

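@ 8-tap subpel filter coefficients for the regular, sharp and smooth filter
@ sets. Each table holds one row of eight signed 16-bit taps per subpel
@ position 1-15; every row sums to 128, matching the rounding right shift
@ by 7 applied when the filtered values are narrowed back to 8 bit.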
const regular_filter, align=4
        .short  0,  1,  -5, 126,   8,  -3,  1,  0
        .short -1,  3, -10, 122,  18,  -6,  2,  0
        .short -1,  4, -13, 118,  27,  -9,  3, -1
        .short -1,  4, -16, 112,  37, -11,  4, -1
        .short -1,  5, -18, 105,  48, -14,  4, -1
        .short -1,  5, -19,  97,  58, -16,  5, -1
        .short -1,  6, -19,  88,  68, -18,  5, -1
        .short -1,  6, -19,  78,  78, -19,  6, -1
        .short -1,  5, -18,  68,  88, -19,  6, -1
        .short -1,  5, -16,  58,  97, -19,  5, -1
        .short -1,  4, -14,  48, 105, -18,  5, -1
        .short -1,  4, -11,  37, 112, -16,  4, -1
        .short -1,  3,  -9,  27, 118, -13,  4, -1
        .short  0,  2,  -6,  18, 122, -10,  3, -1
        .short  0,  1,  -3,   8, 126,  -5,  1,  0
endconst

const sharp_filter, align=4
        .short -1,  3,  -7, 127,   8,  -3,  1,  0
        .short -2,  5, -13, 125,  17,  -6,  3, -1
        .short -3,  7, -17, 121,  27, -10,  5, -2
        .short -4,  9, -20, 115,  37, -13,  6, -2
        .short -4, 10, -23, 108,  48, -16,  8, -3
        .short -4, 10, -24, 100,  59, -19,  9, -3
        .short -4, 11, -24,  90,  70, -21, 10, -4
        .short -4, 11, -23,  80,  80, -23, 11, -4
        .short -4, 10, -21,  70,  90, -24, 11, -4
        .short -3,  9, -19,  59, 100, -24, 10, -4
        .short -3,  8, -16,  48, 108, -23, 10, -4
        .short -2,  6, -13,  37, 115, -20,  9, -4
        .short -2,  5, -10,  27, 121, -17,  7, -3
        .short -1,  3,  -6,  17, 125, -13,  5, -2
        .short  0,  1,  -3,   8, 127,  -7,  3, -1
endconst

const smooth_filter, align=4
        .short -3, -1, 32, 64, 38,  1, -3,  0
        .short -2, -2, 29, 63, 41,  2, -3,  0
        .short -2, -2, 26, 63, 43,  4, -4,  0
        .short -2, -3, 24, 62, 46,  5, -4,  0
        .short -2, -3, 21, 60, 49,  7, -4,  0
        .short -1, -4, 18, 59, 51,  9, -4,  0
        .short -1, -4, 16, 57, 53, 12, -4, -1
        .short -1, -4, 14, 55, 55, 14, -4, -1
        .short -1, -4, 12, 53, 57, 16, -4, -1
        .short  0, -4,  9, 51, 59, 18, -4, -1
        .short  0, -4,  7, 49, 60, 21, -3, -2
        .short  0, -4,  5, 46, 62, 24, -3, -2
        .short  0, -4,  4, 43, 63, 26, -2, -2
        .short  0, -3,  2, 41, 63, 29, -2, -2
        .short  0, -3,  1, 38, 64, 32, -1, -3
endconst

@ All public functions in this file have the following signature:
@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
@                             const uint8_t *ref, ptrdiff_t ref_stride,
@                             int h, int mx, int my);
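@ Under the AAPCS the first four arguments arrive in r0-r3 (dst, dst_stride,
@ ref, ref_stride), while h, mx and my are passed on the stack: on entry they
@ are at [sp], [sp, #4] and [sp, #8], before any pushes done by the functions
@ below.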

function ff_vp9_copy64_neon, export=1
        ldr             r12, [sp]
        sub             r1, r1, #32
        sub             r3, r3, #32
1:
        vld1.8          {q0, q1}, [r2]!
        vst1.8          {q0, q1}, [r0, :128]!
        vld1.8          {q2, q3}, [r2], r3
        subs            r12, r12, #1
        vst1.8          {q2, q3}, [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg64_neon, export=1
        push            {lr}
        ldr             r12, [sp, #4]
        sub             r1, r1, #32
        sub             r3, r3, #32
        mov             lr, r0
1:
        vld1.8          {q8, q9},   [r2]!
        vld1.8          {q0, q1},   [r0, :128]!
        vld1.8          {q10, q11}, [r2], r3
        vrhadd.u8       q0, q0, q8
        vld1.8          {q2, q3},   [r0, :128], r1
        vrhadd.u8       q1, q1, q9
        vrhadd.u8       q2, q2, q10
        vst1.8          {q0, q1},   [lr, :128]!
        vrhadd.u8       q3, q3, q11
        vst1.8          {q2, q3},   [lr, :128], r1
        subs            r12, r12, #1
        bne             1b
        pop             {pc}
endfunc

function ff_vp9_copy32_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {q0, q1}, [r2], r3
        subs            r12, r12, #1
        vst1.8          {q0, q1}, [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg32_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {q2, q3}, [r2], r3
        vld1.8          {q0, q1}, [r0, :128]
        vrhadd.u8       q0, q0, q2
        vrhadd.u8       q1, q1, q3
        subs            r12, r12, #1
        vst1.8          {q0, q1}, [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_copy16_neon, export=1
        push            {r4, lr}
        ldr             r12, [sp, #8]
        add             r4, r0, r1
        add             lr, r2, r3
        add             r1, r1, r1
        add             r3, r3, r3
1:
        vld1.8          {q0}, [r2], r3
        vld1.8          {q1}, [lr], r3
        subs            r12, r12, #2
        vst1.8          {q0}, [r0, :128], r1
        vst1.8          {q1}, [r4, :128], r1
        bne             1b
        pop             {r4, pc}
endfunc

function ff_vp9_avg16_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {q2}, [r2], r3
        vld1.8          {q0}, [r0, :128], r1
        vld1.8          {q3}, [r2], r3
        vrhadd.u8       q0, q0, q2
        vld1.8          {q1}, [r0, :128]
        sub             r0, r0, r1
        vrhadd.u8       q1, q1, q3
        subs            r12, r12, #2
        vst1.8          {q0}, [r0, :128], r1
        vst1.8          {q1}, [r0, :128], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_copy8_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {d0}, [r2], r3
        vld1.8          {d1}, [r2], r3
        subs            r12, r12, #2
        vst1.8          {d0}, [r0, :64], r1
        vst1.8          {d1}, [r0, :64], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg8_neon, export=1
        ldr             r12, [sp]
1:
        vld1.8          {d2}, [r2], r3
        vld1.8          {d0}, [r0, :64], r1
        vld1.8          {d3}, [r2], r3
        vrhadd.u8       d0, d0, d2
        vld1.8          {d1}, [r0, :64]
        sub             r0, r0, r1
        vrhadd.u8       d1, d1, d3
        subs            r12, r12, #2
        vst1.8          {d0}, [r0, :64], r1
        vst1.8          {d1}, [r0, :64], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_copy4_neon, export=1
        ldr             r12, [sp]
1:
        vld1.32         {d0[]}, [r2], r3
        vld1.32         {d1[]}, [r2], r3
        vst1.32         {d0[0]}, [r0, :32], r1
        vld1.32         {d2[]}, [r2], r3
        vst1.32         {d1[0]}, [r0, :32], r1
        vld1.32         {d3[]}, [r2], r3
        subs            r12, r12, #4
        vst1.32         {d2[0]}, [r0, :32], r1
        vst1.32         {d3[0]}, [r0, :32], r1
        bne             1b
        bx              lr
endfunc

function ff_vp9_avg4_neon, export=1
        ldr             r12, [sp]
1:
        vld1.32         {d4[]}, [r2], r3
        vld1.32         {d0[]}, [r0, :32], r1
        vld1.32         {d5[]}, [r2], r3
        vrhadd.u8       d0, d0, d4
        vld1.32         {d1[]}, [r0, :32], r1
        vld1.32         {d6[]}, [r2], r3
        vrhadd.u8       d1, d1, d5
        vld1.32         {d2[]}, [r0, :32], r1
        vld1.32         {d7[]}, [r2], r3
        vrhadd.u8       d2, d2, d6
        vld1.32         {d3[]}, [r0, :32], r1
        sub             r0, r0, r1, lsl #2
        subs            r12, r12, #4
        vst1.32         {d0[0]}, [r0, :32], r1
        vrhadd.u8       d3, d3, d7
        vst1.32         {d1[0]}, [r0, :32], r1
        vst1.32         {d2[0]}, [r0, :32], r1
        vst1.32         {d3[0]}, [r0, :32], r1
        bne             1b
        bx              lr
endfunc

@ Helper macros for vmul/vmla with a constant from either d0 or d1 depending on index
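@ The eight 16-bit taps of the selected filter row are loaded into q0 before
@ these macros are expanded, so taps 0-3 live in d0 and taps 4-7 in d1.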
.macro vmul_lane dst, src, idx
.if \idx < 4
        vmul.s16        \dst, \src, d0[\idx]
.else
        vmul.s16        \dst, \src, d1[\idx - 4]
.endif
.endm
.macro vmla_lane dst, src, idx
.if \idx < 4
        vmla.s16        \dst, \src, d0[\idx]
.else
        vmla.s16        \dst, \src, d1[\idx - 4]
.endif
.endm

@ Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
@ for size >= 16), and multiply-accumulate into dst1 and dst3 (or
@ dst1-dst2 and dst3-dst4 for size >= 16)
.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src4, \src5, #(2*\offset)
.if \size >= 16
        vmla_lane       \dst1, q14, \offset
        vext.8          q5, \src2, \src3, #(2*\offset)
        vmla_lane       \dst3, q15, \offset
        vext.8          q6, \src5, \src6, #(2*\offset)
        vmla_lane       \dst2, q5, \offset
        vmla_lane       \dst4, q6, \offset
.else
        vmla_lane       \dst1, q14, \offset
        vmla_lane       \dst3, q15, \offset
.endif
.endm
@ The same as above, but instead of accumulating straight into the
@ destination, multiply into temporary registers and accumulate those
@ with saturation.
.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src4, \src5, #(2*\offset)
.if \size >= 16
        vmul_lane       q14, q14, \offset
        vext.8          q5, \src2, \src3, #(2*\offset)
        vmul_lane       q15, q15, \offset
        vext.8          q6, \src5, \src6, #(2*\offset)
        vmul_lane       q5, q5, \offset
        vmul_lane       q6, q6, \offset
.else
        vmul_lane       q14, q14, \offset
        vmul_lane       q15, q15, \offset
.endif
        vqadd.s16       \dst1, \dst1, q14
        vqadd.s16       \dst3, \dst3, q15
.if \size >= 16
        vqadd.s16       \dst2, \dst2, q5
        vqadd.s16       \dst4, \dst4, q6
.endif
.endm


@ Instantiate a horizontal filter function for the given size.
@ This can work on 4, 8 or 16 pixels in parallel; for larger
@ widths it will do 16 pixels at a time and loop horizontally.
@ The actual width is passed in r5, the height in r4 and
@ the filter coefficients in r12. idx2 is the index of the largest
@ filter coefficient (3 or 4) and idx1 is the other one of them.
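@ Two rows are processed per iteration of the vertical loop: r0/r2 point at
@ the first row of each pair and r6/r7 at the second, with both strides
@ doubled so each iteration steps down by two rows.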
.macro do_8tap_h type, size, idx1, idx2
function \type\()_8tap_\size\()h_\idx1\idx2
        sub             r2, r2, #3
        add             r6, r0, r1
        add             r7, r2, r3
        add             r1, r1, r1
        add             r3, r3, r3
        @ Only size >= 16 loops horizontally and needs
        @ reduced dst stride
.if \size >= 16
        sub             r1, r1, r5
.endif
        @ size >= 16 loads two qwords and increments r2,
        @ for size 4/8 it's enough with one qword and no
        @ postincrement
.if \size >= 16
        sub             r3, r3, r5
        sub             r3, r3, #8
.endif
        @ Load the filter vector
        vld1.16         {q0}, [r12, :128]
1:
.if \size >= 16
        mov             r12, r5
.endif
        @ Load src
.if \size >= 16
        vld1.8          {d18, d19, d20}, [r2]!
        vld1.8          {d24, d25, d26}, [r7]!
.else
        vld1.8          {q9},  [r2]
        vld1.8          {q12}, [r7]
.endif
        vmovl.u8        q8,  d18
        vmovl.u8        q9,  d19
        vmovl.u8        q11, d24
        vmovl.u8        q12, d25
.if \size >= 16
        vmovl.u8        q10, d20
        vmovl.u8        q13, d26
.endif
2:

        @ Accumulate, adding idx2 last with a separate
        @ saturating add. The positive filter coefficients
        @ for all indices except idx2 must add up to less
        @ than 127 for this not to overflow.
        vmul.s16        q1, q8,  d0[0]
        vmul.s16        q3, q11, d0[0]
.if \size >= 16
        vmul.s16        q2, q9,  d0[0]
        vmul.s16        q4, q12, d0[0]
.endif
        extmla          q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 1,      \size
        extmla          q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 2,      \size
        extmla          q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, \idx1,  \size
        extmla          q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 5,      \size
        extmla          q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 6,      \size
        extmla          q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, 7,      \size
        extmulqadd      q1, q2, q3, q4, q8, q9, q10, q11, q12, q13, \idx2,  \size

        @ Round, shift and saturate
        vqrshrun.s16    d2, q1, #7
        vqrshrun.s16    d6, q3, #7
.if \size >= 16
        vqrshrun.s16    d3, q2, #7
        vqrshrun.s16    d7, q4, #7
.endif
        @ Average
.ifc \type,avg
.if \size >= 16
        vld1.8          {q14}, [r0, :128]
        vld1.8          {q15}, [r6, :128]
        vrhadd.u8       q1, q1, q14
        vrhadd.u8       q3, q3, q15
.elseif \size == 8
        vld1.8          {d28}, [r0, :64]
        vld1.8          {d30}, [r6, :64]
        vrhadd.u8       d2, d2, d28
        vrhadd.u8       d6, d6, d30
.else
        @ We only need d28[0], but [] is faster on some cores
        vld1.32         {d28[]}, [r0, :32]
        vld1.32         {d30[]}, [r6, :32]
        vrhadd.u8       d2, d2, d28
        vrhadd.u8       d6, d6, d30
.endif
.endif
        @ Store and loop horizontally (for size >= 16)
.if \size >= 16
        vst1.8          {q1}, [r0, :128]!
        vst1.8          {q3}, [r6, :128]!
        vmov            q8,  q10
        vmov            q11, q13
        subs            r12, r12, #16
        beq             3f
        vld1.8          {q10}, [r2]!
        vld1.8          {q13}, [r7]!
        vmovl.u8        q9,  d20
        vmovl.u8        q10, d21
        vmovl.u8        q12, d26
        vmovl.u8        q13, d27
        b               2b
.elseif \size == 8
        vst1.8          {d2}, [r0, :64]
        vst1.8          {d6}, [r6, :64]
.else @ \size == 4
        vst1.32         {d2[0]}, [r0, :32]
        vst1.32         {d6[0]}, [r6, :32]
.endif
3:
        @ Loop vertically
        add             r0, r0, r1
        add             r6, r6, r1
        add             r2, r2, r3
        add             r7, r7, r3
        subs            r4, r4, #2
        bne             1b
.if \size >= 16
        vpop            {q4-q6}
.endif
        pop             {r4-r7}
        bx              lr
endfunc
.endm

.macro do_8tap_h_size size
do_8tap_h put, \size, 3, 4
do_8tap_h avg, \size, 3, 4
do_8tap_h put, \size, 4, 3
do_8tap_h avg, \size, 4, 3
.endm

do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16

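@ Entry point wrappers. h and mx are fetched from the stack; the offsets
@ account for the 16 bytes of GPRs pushed here plus, for size >= 16, the
@ 48 bytes of q4-q6. For mx < 8 the largest filter tap sits at index 3
@ (the _43 variants), for mx >= 8 at index 4 (the _34 variants).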
.macro do_8tap_h_func type, filter, size
function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
        push            {r4-r7}
.if \size >= 16
        vpush           {q4-q6}
        ldr             r4, [sp, #64]
        ldr             r5, [sp, #68]
.else
        ldr             r4, [sp, #16]
        ldr             r5, [sp, #20]
.endif
        movrel          r12, \filter\()_filter-16
        cmp             r5, #8
        add             r12, r12, r5, lsl #4
        mov             r5, #\size
.if \size >= 16
        bge             \type\()_8tap_16h_34
        b               \type\()_8tap_16h_43
.else
        bge             \type\()_8tap_\size\()h_34
        b               \type\()_8tap_\size\()h_43
.endif
endfunc
.endm

.macro do_8tap_h_filters size
do_8tap_h_func put, regular, \size
do_8tap_h_func avg, regular, \size
do_8tap_h_func put, sharp, \size
do_8tap_h_func avg, sharp, \size
do_8tap_h_func put, smooth, \size
do_8tap_h_func avg, smooth, \size
.endm

do_8tap_h_filters 64
do_8tap_h_filters 32
do_8tap_h_filters 16
do_8tap_h_filters 8
do_8tap_h_filters 4

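@ Emit the current literal pool here, near the middle of the file, so that
@ literal loads in the surrounding functions (such as those movrel can
@ expand to on targets without movw/movt) stay within pc-relative range.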
.ltorg

@ Vertical filters

@ Round, shift and saturate and store qreg1-2 over 4 lines
.macro do_store4 qreg1, dreg1, qreg2, dreg2, tmp1, tmp2, type
        vqrshrun.s16    \dreg1, \qreg1, #7
        vqrshrun.s16    \dreg2, \qreg2, #7
.ifc \type,avg
        vld1.32         {\tmp1[]},  [r0, :32], r1
        vld1.32         {\tmp2[]},  [r0, :32], r1
        vld1.32         {\tmp1[1]}, [r0, :32], r1
        vld1.32         {\tmp2[1]}, [r0, :32], r1
        vrhadd.u8       \dreg1, \dreg1, \tmp1
        vrhadd.u8       \dreg2, \dreg2, \tmp2
        sub             r0, r0, r1, lsl #2
.endif
        vst1.32         {\dreg1[0]}, [r0, :32], r1
        vst1.32         {\dreg2[0]}, [r0, :32], r1
        vst1.32         {\dreg1[1]}, [r0, :32], r1
        vst1.32         {\dreg2[1]}, [r0, :32], r1
.endm

@ Round, shift and saturate and store qreg1-4
.macro do_store qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, type
        vqrshrun.s16    \dreg1, \qreg1, #7
        vqrshrun.s16    \dreg2, \qreg2, #7
        vqrshrun.s16    \dreg3, \qreg3, #7
        vqrshrun.s16    \dreg4, \qreg4, #7
.ifc \type,avg
        vld1.8          {\tmp1}, [r0, :64], r1
        vld1.8          {\tmp2}, [r0, :64], r1
        vld1.8          {\tmp3}, [r0, :64], r1
        vld1.8          {\tmp4}, [r0, :64], r1
        vrhadd.u8       \dreg1, \dreg1, \tmp1
        vrhadd.u8       \dreg2, \dreg2, \tmp2
        vrhadd.u8       \dreg3, \dreg3, \tmp3
        vrhadd.u8       \dreg4, \dreg4, \tmp4
        sub             r0, r0, r1, lsl #2
.endif
        vst1.8          {\dreg1}, [r0, :64], r1
        vst1.8          {\dreg2}, [r0, :64], r1
        vst1.8          {\dreg3}, [r0, :64], r1
        vst1.8          {\dreg4}, [r0, :64], r1
.endm

@ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
@ (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
@ at the end with saturation. Indices 0 and 7 always have negative or zero
@ coefficients, so they can be accumulated into tmp1-tmp2 together with the
@ largest coefficient.
.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
        vmul.s16        \dst1, \src2, d0[1]
        vmul.s16        \dst2, \src3, d0[1]
        vmul.s16        \tmp1, \src1, d0[0]
        vmul.s16        \tmp2, \src2, d0[0]
        vmla.s16        \dst1, \src3, d0[2]
        vmla.s16        \dst2, \src4, d0[2]
.if \idx1 == 3
        vmla.s16        \dst1, \src4, d0[3]
        vmla.s16        \dst2, \src5, d0[3]
.else
        vmla.s16        \dst1, \src5, d1[0]
        vmla.s16        \dst2, \src6, d1[0]
.endif
        vmla.s16        \dst1, \src6, d1[1]
        vmla.s16        \dst2, \src7, d1[1]
        vmla.s16        \tmp1, \src8, d1[3]
        vmla.s16        \tmp2, \src9, d1[3]
        vmla.s16        \dst1, \src7, d1[2]
        vmla.s16        \dst2, \src8, d1[2]
.if \idx2 == 3
        vmla.s16        \tmp1, \src4, d0[3]
        vmla.s16        \tmp2, \src5, d0[3]
.else
        vmla.s16        \tmp1, \src5, d1[0]
        vmla.s16        \tmp2, \src6, d1[0]
.endif
        vqadd.s16       \dst1, \dst1, \tmp1
        vqadd.s16       \dst2, \dst2, \tmp2
.endm

@ Load pixels and extend them to 16 bit
.macro loadl dst1, dst2, dst3, dst4
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
.ifnb \dst4
        vld1.8          {d5}, [r2], r3
.endif
        vmovl.u8        \dst1, d2
        vmovl.u8        \dst2, d3
        vmovl.u8        \dst3, d4
.ifnb \dst4
        vmovl.u8        \dst4, d5
.endif
.endm

@ Instantiate a vertical filter function for filtering 8 pixels at a time.
@ The height is passed in r4, the width in r5 and the filter coefficients
@ in r12. idx2 is the index of the largest filter coefficient (3 or 4)
@ and idx1 is the other one of them.
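@ The inner loop below is unrolled three times, rotating which q registers
@ hold the sliding window of input rows, so no register-to-register copies
@ are needed when moving on to the next four output rows.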
.macro do_8tap_8v type, idx1, idx2
function \type\()_8tap_8v_\idx1\idx2
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, r3
        vld1.16         {q0}, [r12, :128]
1:
        mov             r12, r4

        loadl           q5, q6, q7
        loadl           q8, q9, q10, q11
2:
        loadl           q12, q13, q14, q15
        convolve        q1, q2, q5, q6, q7, q8, q9, q10, q11, q12, q13, \idx1, \idx2, q4, q5
        convolve        q3, q4, q7, q8, q9, q10, q11, q12, q13, q14, q15, \idx1, \idx2, q5, q6
        do_store        q1, d2, q2, d4, q3, d6, q4, d8, d3, d5, d7, d9, \type

        subs            r12, r12, #4
        beq             8f

        loadl           q4, q5, q6, q7
        convolve        q1, q2, q9, q10, q11, q12, q13, q14, q15, q4, q5, \idx1, \idx2, q8, q9
        convolve        q3, q8, q11, q12, q13, q14, q15, q4, q5, q6, q7, \idx1, \idx2, q9, q10
        do_store        q1, d2, q2, d4, q3, d6, q8, d16, d3, d5, d7, d17, \type

        subs            r12, r12, #4
        beq             8f

        loadl           q8, q9, q10, q11
        convolve        q1, q2, q13, q14, q15, q4, q5, q6, q7, q8, q9, \idx1, \idx2, q12, q13
        convolve        q3, q12, q15, q4, q5, q6, q7, q8, q9, q10, q11, \idx1, \idx2, q13, q14
        do_store        q1, d2, q2, d4, q3, d6, q12, d24, d3, d5, d7, d25, \type

        subs            r12, r12, #4
        bne             2b

8:
        subs            r5, r5, #8
        beq             9f
        @ r0 -= h * dst_stride
        mls             r0, r1, r4, r0
        @ r2 -= h * src_stride
        mls             r2, r3, r4, r2
        @ r2 -= 8 * src_stride
        sub             r2, r2, r3, lsl #3
        @ r2 += 1 * src_stride
        add             r2, r2, r3
        add             r2, r2, #8
        add             r0, r0, #8
        b               1b
9:
        vpop            {q4-q7}
        pop             {r4-r5}
        bx              lr
endfunc
.endm

do_8tap_8v put, 3, 4
do_8tap_8v put, 4, 3
do_8tap_8v avg, 3, 4
do_8tap_8v avg, 4, 3

@ Instantiate a vertical filter function for filtering a 4 pixels wide
@ slice. The first half of each register contains one row, while the second
@ half of a register contains the second-next row (also stored in the first
@ half of the register two steps ahead). The convolution does two outputs
@ at a time; the output of q5-q12 into one, and q6-q13 into another one.
@ The first half of the first output is the first output row, the first
@ half of the other output is the second output row. The second halves of
@ the registers are rows 3 and 4.
@ This is only designed to work for 4 or 8 output lines.
.macro do_8tap_4v type, idx1, idx2
function \type\()_8tap_4v_\idx1\idx2
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, r3
        vld1.16         {q0}, [r12, :128]

        vld1.32         {d2[]},  [r2], r3
        vld1.32         {d3[]},  [r2], r3
        vld1.32         {d4[]},  [r2], r3
        vld1.32         {d5[]},  [r2], r3
        vld1.32         {d6[]},  [r2], r3
        vld1.32         {d7[]},  [r2], r3
        vext.8          d2, d2, d4, #4
        vld1.32         {d8[]},  [r2], r3
        vext.8          d3, d3, d5, #4
        vld1.32         {d9[]},  [r2], r3
        vmovl.u8        q5, d2
        vext.8          d4, d4, d6, #4
        vld1.32         {d28[]}, [r2], r3
        vmovl.u8        q6, d3
        vext.8          d5, d5, d7, #4
        vld1.32         {d29[]}, [r2], r3
        vmovl.u8        q7, d4
        vext.8          d6, d6, d8, #4
        vld1.32         {d30[]}, [r2], r3
        vmovl.u8        q8, d5
        vext.8          d7, d7, d9, #4
        vmovl.u8        q9, d6
        vext.8          d8, d8, d28, #4
        vmovl.u8        q10, d7
        vext.8          d9, d9, d29, #4
        vmovl.u8        q11, d8
        vext.8          d28, d28, d30, #4
        vmovl.u8        q12, d9
        vmovl.u8        q13, d28

        convolve        q1, q2, q5, q6, q7, q8, q9, q10, q11, q12, q13, \idx1, \idx2, q4, q3
        do_store4       q1, d2, q2, d4, d3, d5, \type
        subs            r4, r4, #4
        beq             9f

        vld1.32         {d2[]},  [r2], r3
        vld1.32         {d3[]},  [r2], r3
        vext.8          d29, d29, d2, #4
        vext.8          d30, d30, d3, #4
        vld1.32         {d2[1]}, [r2], r3
        vmovl.u8        q14, d29
        vld1.32         {d3[1]}, [r2], r3
        vmovl.u8        q15, d30
        vmovl.u8        q5, d2
        vmovl.u8        q6, d3

        convolve        q1, q2, q9, q10, q11, q12, q13, q14, q15, q5, q6, \idx1, \idx2, q4, q3
        do_store4       q1, d2, q2, d4, d3, d5, \type

9:
        vpop            {q4-q7}
        pop             {r4-r5}
        bx              lr
endfunc
.endm

do_8tap_4v put, 3, 4
do_8tap_4v put, 4, 3
do_8tap_4v avg, 3, 4
do_8tap_4v avg, 4, 3

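@ Entry point wrappers for the vertical filters. After pushing r4-r5 (8
@ bytes) and q4-q7 (64 bytes), h is found at [sp, #72] and my at [sp, #80].
@ As for the horizontal filters, my < 8 selects the _43 variants and
@ my >= 8 the _34 ones.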
.macro do_8tap_v_func type, filter, size
function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
        push            {r4-r5}
        vpush           {q4-q7}
        ldr             r4, [sp, #72]
        ldr             r5, [sp, #80]
        movrel          r12, \filter\()_filter-16
        add             r12, r12, r5, lsl #4
        cmp             r5, #8
        mov             r5, #\size
.if \size >= 8
        bge             \type\()_8tap_8v_34
        b               \type\()_8tap_8v_43
.else
        bge             \type\()_8tap_4v_34
        b               \type\()_8tap_4v_43
.endif
endfunc
.endm

.macro do_8tap_v_filters size
do_8tap_v_func put, regular, \size
do_8tap_v_func avg, regular, \size
do_8tap_v_func put, sharp, \size
do_8tap_v_func avg, sharp, \size
do_8tap_v_func put, smooth, \size
do_8tap_v_func avg, smooth, \size
.endm

do_8tap_v_filters 64
do_8tap_v_filters 32
do_8tap_v_filters 16
do_8tap_v_filters 8
do_8tap_v_filters 4