aarch64: vp9mc: Fix a comment to refer to a register with the right name
[libav.git] / libavcodec / aarch64 / vp9mc_neon.S
/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// All public functions in this file have the following signature:
// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
//                             const uint8_t *ref, ptrdiff_t ref_stride,
//                             int h, int mx, int my);
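// With the AArch64 calling convention this maps the arguments onto
// x0 = dst, x1 = dst_stride, x2 = ref, x3 = ref_stride,
// w4 = h, w5 = mx and w6 = my, which is how the registers are used below.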

function ff_vp9_copy64_aarch64, export=1
1:
        ldp             x5,  x6,  [x2]
        ldp             x7,  x8,  [x2, #16]
        stp             x5,  x6,  [x0]
        ldp             x9,  x10, [x2, #32]
        stp             x7,  x8,  [x0, #16]
        subs            w4,  w4,  #1
        ldp             x11, x12, [x2, #48]
        stp             x9,  x10, [x0, #32]
        stp             x11, x12, [x0, #48]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.ne            1b
        ret
endfunc

function ff_vp9_avg64_neon, export=1
        mov             x5,  x0
1:
        ld1             {v4.16b,  v5.16b,  v6.16b,  v7.16b},  [x2], x3
        ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x0], x1
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
        urhadd          v0.16b,  v0.16b,  v4.16b
        urhadd          v1.16b,  v1.16b,  v5.16b
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
        urhadd          v2.16b,  v2.16b,  v6.16b
        urhadd          v3.16b,  v3.16b,  v7.16b
        subs            w4,  w4,  #2
        urhadd          v16.16b, v16.16b, v20.16b
        urhadd          v17.16b, v17.16b, v21.16b
        st1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5], x1
        urhadd          v18.16b, v18.16b, v22.16b
        urhadd          v19.16b, v19.16b, v23.16b
        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_copy32_aarch64, export=1
1:
        ldp             x5,  x6,  [x2]
        ldp             x7,  x8,  [x2, #16]
        stp             x5,  x6,  [x0]
        subs            w4,  w4,  #1
        stp             x7,  x8,  [x0, #16]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.ne            1b
        ret
endfunc

function ff_vp9_avg32_neon, export=1
1:
        ld1             {v2.16b, v3.16b}, [x2], x3
        ld1             {v0.16b, v1.16b}, [x0]
        urhadd          v0.16b,  v0.16b,  v2.16b
        urhadd          v1.16b,  v1.16b,  v3.16b
        subs            w4,  w4,  #1
        st1             {v0.16b, v1.16b}, [x0], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_copy16_neon, export=1
        add             x5,  x0,  x1
        lsl             x1,  x1,  #1
        add             x6,  x2,  x3
        lsl             x3,  x3,  #1
1:
        ld1             {v0.16b}, [x2], x3
        ld1             {v1.16b}, [x6], x3
        ld1             {v2.16b}, [x2], x3
        ld1             {v3.16b}, [x6], x3
        subs            w4,  w4,  #4
        st1             {v0.16b}, [x0], x1
        st1             {v1.16b}, [x5], x1
        st1             {v2.16b}, [x0], x1
        st1             {v3.16b}, [x5], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_avg16_neon, export=1
        mov             x5,  x0
1:
        ld1             {v2.16b}, [x2], x3
        ld1             {v0.16b}, [x0], x1
        ld1             {v3.16b}, [x2], x3
        urhadd          v0.16b,  v0.16b,  v2.16b
        ld1             {v1.16b}, [x0], x1
        urhadd          v1.16b,  v1.16b,  v3.16b
        subs            w4,  w4,  #2
        st1             {v0.16b}, [x5], x1
        st1             {v1.16b}, [x5], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_copy8_neon, export=1
1:
        ld1             {v0.8b}, [x2], x3
        ld1             {v1.8b}, [x2], x3
        subs            w4,  w4,  #2
        st1             {v0.8b}, [x0], x1
        st1             {v1.8b}, [x0], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_avg8_neon, export=1
        mov             x5,  x0
1:
        ld1             {v2.8b}, [x2], x3
        ld1             {v0.8b}, [x0], x1
        ld1             {v3.8b}, [x2], x3
        urhadd          v0.8b,  v0.8b,  v2.8b
        ld1             {v1.8b}, [x0], x1
        urhadd          v1.8b,  v1.8b,  v3.8b
        subs            w4,  w4,  #2
        st1             {v0.8b}, [x5], x1
        st1             {v1.8b}, [x5], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_copy4_neon, export=1
1:
        ld1             {v0.s}[0], [x2], x3
        ld1             {v1.s}[0], [x2], x3
        st1             {v0.s}[0], [x0], x1
        ld1             {v2.s}[0], [x2], x3
        st1             {v1.s}[0], [x0], x1
        ld1             {v3.s}[0], [x2], x3
        subs            w4,  w4,  #4
        st1             {v2.s}[0], [x0], x1
        st1             {v3.s}[0], [x0], x1
        b.ne            1b
        ret
endfunc

function ff_vp9_avg4_neon, export=1
        mov             x5,  x0
1:
        ld1             {v2.s}[0], [x2], x3
        ld1             {v0.s}[0], [x0], x1
        ld1             {v2.s}[1], [x2], x3
        ld1             {v0.s}[1], [x0], x1
        ld1             {v3.s}[0], [x2], x3
        ld1             {v1.s}[0], [x0], x1
        ld1             {v3.s}[1], [x2], x3
        ld1             {v1.s}[1], [x0], x1
        subs            w4,  w4,  #4
        urhadd          v0.8b,  v0.8b,  v2.8b
        urhadd          v1.8b,  v1.8b,  v3.8b
        st1             {v0.s}[0], [x5], x1
        st1             {v0.s}[1], [x5], x1
        st1             {v1.s}[0], [x5], x1
        st1             {v1.s}[1], [x5], x1
        b.ne            1b
        ret
endfunc


// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
// for size >= 16), and multiply-accumulate into dst1 and dst3 (or
// dst1-dst2 and dst3-dst4 for size >= 16)
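// In C-like terms, one extmla step computes roughly
//     dst += ext(srcA, srcB, offset) * filter[offset]
// i.e. it forms the window of 8 pixels (already widened to 16 bit) starting
// at the given tap offset and accumulates that window scaled by the
// corresponding filter coefficient.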
.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
        ext             v20.16b, \src1, \src2, #(2*\offset)
        ext             v22.16b, \src4, \src5, #(2*\offset)
.if \size >= 16
        mla             \dst1,  v20.8h, v0.h[\offset]
        ext             v21.16b, \src2, \src3, #(2*\offset)
        mla             \dst3,  v22.8h, v0.h[\offset]
        ext             v23.16b, \src5, \src6, #(2*\offset)
        mla             \dst2,  v21.8h, v0.h[\offset]
        mla             \dst4,  v23.8h, v0.h[\offset]
.else
        mla             \dst1,  v20.8h, v0.h[\offset]
        mla             \dst3,  v22.8h, v0.h[\offset]
.endif
.endm
// The same as above, but instead of accumulating straight into the
// destination, use a temp register and accumulate with saturation.
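// Using a plain mul into a temporary followed by a final sqadd (instead of
// an mla) makes the addition of the largest tap saturating, so the 16 bit
// intermediates cannot wrap around.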
.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
        ext             v20.16b, \src1, \src2, #(2*\offset)
        ext             v22.16b, \src4, \src5, #(2*\offset)
.if \size >= 16
        mul             v20.8h,  v20.8h,  v0.h[\offset]
        ext             v21.16b, \src2, \src3, #(2*\offset)
        mul             v22.8h,  v22.8h,  v0.h[\offset]
        ext             v23.16b, \src5, \src6, #(2*\offset)
        mul             v21.8h,  v21.8h,  v0.h[\offset]
        mul             v23.8h,  v23.8h,  v0.h[\offset]
.else
        mul             v20.8h,  v20.8h,  v0.h[\offset]
        mul             v22.8h,  v22.8h,  v0.h[\offset]
.endif
        sqadd           \dst1,  \dst1,  v20.8h
        sqadd           \dst3,  \dst3,  v22.8h
.if \size >= 16
        sqadd           \dst2,  \dst2,  v21.8h
        sqadd           \dst4,  \dst4,  v23.8h
.endif
.endm


// Instantiate a horizontal filter function for the given size.
// This can work on 4, 8 or 16 pixels in parallel; for larger
// widths it will do 16 pixels at a time and loop horizontally.
// The actual width is passed in x5, the height in w4 and the
// filter coefficients in x9. idx2 is the index of the largest
// filter coefficient (3 or 4) and idx1 is the other one of them.
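// As a plain C reference, each output pixel is the usual VP9 8-tap
// convolution, which is what the sub #3 adjustment and the final
// sqrshrun #7 below implement:
//     dst[x] = av_clip_uint8((sum(src[x + i - 3] * filter[i], i = 0..7) + 64) >> 7)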
.macro do_8tap_h type, size, idx1, idx2
function \type\()_8tap_\size\()h_\idx1\idx2
        sub             x2,  x2,  #3
        add             x6,  x0,  x1
        add             x7,  x2,  x3
        add             x1,  x1,  x1
        add             x3,  x3,  x3
        // Only size >= 16 loops horizontally and needs
        // reduced dst stride
.if \size >= 16
        sub             x1,  x1,  x5
.endif
        // size >= 16 loads two qwords and increments x2,
        // for size 4/8 it's enough with one qword and no
        // postincrement
.if \size >= 16
        sub             x3,  x3,  x5
        sub             x3,  x3,  #8
.endif
        // Load the filter vector
        ld1             {v0.8b}, [x9]
        sxtl            v0.8h,  v0.8b
1:
.if \size >= 16
        mov             x9,  x5
.endif
        // Load src
.if \size >= 16
        ld1             {v4.8b,  v5.8b,  v6.8b},  [x2], #24
        ld1             {v16.8b, v17.8b, v18.8b}, [x7], #24
.else
        ld1             {v4.8b,  v5.8b},  [x2]
        ld1             {v16.8b, v17.8b}, [x7]
.endif
        uxtl            v4.8h,  v4.8b
        uxtl            v5.8h,  v5.8b
        uxtl            v16.8h, v16.8b
        uxtl            v17.8h, v17.8b
.if \size >= 16
        uxtl            v6.8h,  v6.8b
        uxtl            v18.8h, v18.8b
.endif
2:

        // Accumulate, adding idx2 last with a separate
        // saturating add. The positive filter coefficients
        // for all indices except idx2 must add up to less
        // than 127 for this not to overflow.
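        // (The inputs are zero extended 8 bit pixels, at most 255 each;
        // with the remaining positive coefficients summing to less than
        // 127, the partial sum stays below 255 * 127 = 32385 < 32767, so
        // the mla accumulation cannot overflow on the positive side. The
        // largest tap is then added with a saturating sqadd.)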
        mul             v1.8h,  v4.8h,  v0.h[0]
        mul             v24.8h, v16.8h, v0.h[0]
.if \size >= 16
        mul             v2.8h,  v5.8h,  v0.h[0]
        mul             v25.8h, v17.8h, v0.h[0]
.endif
        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 1,     \size
        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 2,     \size
        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, \idx1, \size
        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 5,     \size
        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 6,     \size
        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 7,     \size
        extmulqadd      v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, \idx2, \size

        // Round, shift and saturate
        sqrshrun        v1.8b,   v1.8h,  #7
        sqrshrun        v24.8b,  v24.8h, #7
.if \size >= 16
        sqrshrun2       v1.16b,  v2.8h,  #7
        sqrshrun2       v24.16b, v25.8h, #7
.endif
        // Average
.ifc \type,avg
.if \size >= 16
        ld1             {v2.16b}, [x0]
        ld1             {v3.16b}, [x6]
        urhadd          v1.16b,  v1.16b,  v2.16b
        urhadd          v24.16b, v24.16b, v3.16b
.elseif \size == 8
        ld1             {v2.8b}, [x0]
        ld1             {v3.8b}, [x6]
        urhadd          v1.8b,  v1.8b,  v2.8b
        urhadd          v24.8b, v24.8b, v3.8b
.else
        ld1             {v2.s}[0], [x0]
        ld1             {v3.s}[0], [x6]
        urhadd          v1.8b,  v1.8b,  v2.8b
        urhadd          v24.8b, v24.8b, v3.8b
.endif
.endif
        // Store and loop horizontally (for size >= 16)
.if \size >= 16
        subs            x9,  x9,  #16
        st1             {v1.16b},  [x0], #16
        st1             {v24.16b}, [x6], #16
        b.eq            3f
        mov             v4.16b,  v6.16b
        mov             v16.16b, v18.16b
        ld1             {v6.16b},  [x2], #16
        ld1             {v18.16b}, [x7], #16
        uxtl            v5.8h,  v6.8b
        uxtl2           v6.8h,  v6.16b
        uxtl            v17.8h, v18.8b
        uxtl2           v18.8h, v18.16b
        b               2b
.elseif \size == 8
        st1             {v1.8b},  [x0]
        st1             {v24.8b}, [x6]
.else // \size == 4
        st1             {v1.s}[0],  [x0]
        st1             {v24.s}[0], [x6]
.endif
3:
        // Loop vertically
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x2,  x2,  x3
        add             x7,  x7,  x3
        subs            w4,  w4,  #2
        b.ne            1b
        ret
endfunc
.endm

.macro do_8tap_h_size size
do_8tap_h put, \size, 3, 4
do_8tap_h avg, \size, 3, 4
do_8tap_h put, \size, 4, 3
do_8tap_h avg, \size, 4, 3
.endm

do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16

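// The per-filter wrappers below pick the filter bank (smooth/regular/sharp)
// via \offset and the individual filter for the fractional position mx.
// Judging by the 120*\offset stride and the 8 bit filter load above, each
// bank presumably holds 15 filters of 8 int8_t taps (120 bytes); mx ranges
// from 1 to 15, so the -8 turns mx*8 into (mx-1)*8. The comparison with 8
// selects the _34 variant (largest tap at index 4) for mx >= 8 and the
// _43 variant (largest tap at index 3) otherwise.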
.macro do_8tap_h_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
        movrel          x6,  X(ff_vp9_subpel_filters), 120*\offset - 8
        cmp             w5,  #8
        add             x9,  x6,  w5, uxtw #3
        mov             x5,  #\size
.if \size >= 16
        b.ge            \type\()_8tap_16h_34
        b               \type\()_8tap_16h_43
.else
        b.ge            \type\()_8tap_\size\()h_34
        b               \type\()_8tap_\size\()h_43
.endif
endfunc
.endm

.macro do_8tap_h_filters size
do_8tap_h_func put, regular, 1, \size
do_8tap_h_func avg, regular, 1, \size
do_8tap_h_func put, sharp,   2, \size
do_8tap_h_func avg, sharp,   2, \size
do_8tap_h_func put, smooth,  0, \size
do_8tap_h_func avg, smooth,  0, \size
.endm

do_8tap_h_filters 64
do_8tap_h_filters 32
do_8tap_h_filters 16
do_8tap_h_filters 8
do_8tap_h_filters 4


// Vertical filters

// Round, shift and saturate and store reg1-reg2 over 4 lines
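// (When called from the 4 pixel wide vertical filter below, reg1 holds
// output rows 0 and 2 in its two halves and reg2 rows 1 and 3, which is
// why the stores alternate between the two registers.)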
.macro do_store4 reg1, reg2, tmp1, tmp2, type
        sqrshrun        \reg1\().8b,  \reg1\().8h, #7
        sqrshrun        \reg2\().8b,  \reg2\().8h, #7
.ifc \type,avg
        ld1             {\tmp1\().s}[0], [x7], x1
        ld1             {\tmp2\().s}[0], [x7], x1
        ld1             {\tmp1\().s}[1], [x7], x1
        ld1             {\tmp2\().s}[1], [x7], x1
        urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
        urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
.endif
        st1             {\reg1\().s}[0], [x0], x1
        st1             {\reg2\().s}[0], [x0], x1
        st1             {\reg1\().s}[1], [x0], x1
        st1             {\reg2\().s}[1], [x0], x1
.endm

// Round, shift and saturate and store reg1-4
.macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type
        sqrshrun        \reg1\().8b,  \reg1\().8h, #7
        sqrshrun        \reg2\().8b,  \reg2\().8h, #7
        sqrshrun        \reg3\().8b,  \reg3\().8h, #7
        sqrshrun        \reg4\().8b,  \reg4\().8h, #7
.ifc \type,avg
        ld1             {\tmp1\().8b}, [x7], x1
        ld1             {\tmp2\().8b}, [x7], x1
        ld1             {\tmp3\().8b}, [x7], x1
        ld1             {\tmp4\().8b}, [x7], x1
        urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
        urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
        urhadd          \reg3\().8b,  \reg3\().8b,  \tmp3\().8b
        urhadd          \reg4\().8b,  \reg4\().8b,  \tmp4\().8b
.endif
        st1             {\reg1\().8b}, [x0], x1
        st1             {\reg2\().8b}, [x0], x1
        st1             {\reg3\().8b}, [x0], x1
        st1             {\reg4\().8b}, [x0], x1
.endm

// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
// (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
// at the end with saturation. Indices 0 and 7 always have negative or zero
// coefficients, so they can be accumulated into tmp1-tmp2 together with the
// largest coefficient.
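// The tap grouping is: dst accumulates taps 1, 2, idx1, 5 and 6, tmp
// accumulates taps 0, 7 and the largest tap idx2, and the two halves are
// combined with a saturating sqadd at the end.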
.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
        mul             \dst1\().8h, \src2\().8h, v0.h[1]
        mul             \dst2\().8h, \src3\().8h, v0.h[1]
        mul             \tmp1\().8h, \src1\().8h, v0.h[0]
        mul             \tmp2\().8h, \src2\().8h, v0.h[0]
        mla             \dst1\().8h, \src3\().8h, v0.h[2]
        mla             \dst2\().8h, \src4\().8h, v0.h[2]
.if \idx1 == 3
        mla             \dst1\().8h, \src4\().8h, v0.h[3]
        mla             \dst2\().8h, \src5\().8h, v0.h[3]
.else
        mla             \dst1\().8h, \src5\().8h, v0.h[4]
        mla             \dst2\().8h, \src6\().8h, v0.h[4]
.endif
        mla             \dst1\().8h, \src6\().8h, v0.h[5]
        mla             \dst2\().8h, \src7\().8h, v0.h[5]
        mla             \tmp1\().8h, \src8\().8h, v0.h[7]
        mla             \tmp2\().8h, \src9\().8h, v0.h[7]
        mla             \dst1\().8h, \src7\().8h, v0.h[6]
        mla             \dst2\().8h, \src8\().8h, v0.h[6]
.if \idx2 == 3
        mla             \tmp1\().8h, \src4\().8h, v0.h[3]
        mla             \tmp2\().8h, \src5\().8h, v0.h[3]
.else
        mla             \tmp1\().8h, \src5\().8h, v0.h[4]
        mla             \tmp2\().8h, \src6\().8h, v0.h[4]
.endif
        sqadd           \dst1\().8h, \dst1\().8h, \tmp1\().8h
        sqadd           \dst2\().8h, \dst2\().8h, \tmp2\().8h
.endm

// Load pixels and extend them to 16 bit
.macro loadl dst1, dst2, dst3, dst4
        ld1             {v1.8b}, [x2], x3
        ld1             {v2.8b}, [x2], x3
        ld1             {v3.8b}, [x2], x3
.ifnb \dst4
        ld1             {v4.8b}, [x2], x3
.endif
        uxtl            \dst1\().8h, v1.8b
        uxtl            \dst2\().8h, v2.8b
        uxtl            \dst3\().8h, v3.8b
.ifnb \dst4
        uxtl            \dst4\().8h, v4.8b
.endif
.endm

// Instantiate a vertical filter function for filtering 8 pixels at a time.
// The height is passed in x4, the width in x5 and the filter coefficients
// in x6. idx2 is the index of the largest filter coefficient (3 or 4)
// and idx1 is the other one of them.
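// Each pass of the inner loop below produces four output rows of an
// 8 pixel wide column: output row y needs input rows y-3 .. y+4, so four
// output rows read an 11 row window, which is kept in v16-v27 and rotated
// through the register names instead of being reloaded. Once a column is
// done, the pointers are rewound and moved 8 pixels to the right.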
.macro do_8tap_8v type, idx1, idx2
function \type\()_8tap_8v_\idx1\idx2
        sub             x2,  x2,  x3, lsl #1
        sub             x2,  x2,  x3
        ld1             {v0.8b}, [x6]
        sxtl            v0.8h,  v0.8b
1:
.ifc \type,avg
        mov             x7,  x0
.endif
        mov             x6,  x4

        loadl           v17, v18, v19

        loadl           v20, v21, v22, v23
2:
        loadl           v24, v25, v26, v27
        convolve        v1,  v2,  v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v5,  v6
        convolve        v3,  v4,  v19, v20, v21, v22, v23, v24, v25, v26, v27, \idx1, \idx2, v5,  v6
        do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type

        subs            x6,  x6,  #4
        b.eq            8f

        loadl           v16, v17, v18, v19
        convolve        v1,  v2,  v21, v22, v23, v24, v25, v26, v27, v16, v17, \idx1, \idx2, v5,  v6
        convolve        v3,  v4,  v23, v24, v25, v26, v27, v16, v17, v18, v19, \idx1, \idx2, v5,  v6
        do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type

        subs            x6,  x6,  #4
        b.eq            8f

        loadl           v20, v21, v22, v23
        convolve        v1,  v2,  v25, v26, v27, v16, v17, v18, v19, v20, v21, \idx1, \idx2, v5,  v6
        convolve        v3,  v4,  v27, v16, v17, v18, v19, v20, v21, v22, v23, \idx1, \idx2, v5,  v6
        do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type

        subs            x6,  x6,  #4
        b.ne            2b

8:
        subs            x5,  x5,  #8
        b.eq            9f
        // x0 -= h * dst_stride
        msub            x0,  x1,  x4,  x0
        // x2 -= h * src_stride
        msub            x2,  x3,  x4,  x2
        // x2 -= 8 * src_stride
        sub             x2,  x2,  x3, lsl #3
        // x2 += 1 * src_stride
        add             x2,  x2,  x3
        add             x2,  x2,  #8
        add             x0,  x0,  #8
        b               1b
9:
        ret
endfunc
.endm

do_8tap_8v put, 3, 4
do_8tap_8v put, 4, 3
do_8tap_8v avg, 3, 4
do_8tap_8v avg, 4, 3


// Instantiate a vertical filter function for filtering a 4 pixel wide
// slice. The first half of each register contains one row, while the second
// half of a register contains the second-next row (also stored in the first
// half of the register two steps ahead). The convolution does two outputs
// at a time; the output of v17-v24 into one, and v18-v25 into another one.
// The first half of the first output is the first output row, the first
// half of the other output is the second output row. The second halves of
// the registers are rows 3 and 4.
// This is only designed to work for 4 or 8 output lines.
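// For example, after the initial loads below, v17 holds rows {0, 2},
// v18 rows {1, 3}, and so on up to v25 holding rows {8, 10}; the convolve
// then produces output rows {0, 2} in its first destination and {1, 3} in
// the second, matching the interleaved stores in do_store4.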
.macro do_8tap_4v type, idx1, idx2
function \type\()_8tap_4v_\idx1\idx2
        sub             x2,  x2,  x3, lsl #1
        sub             x2,  x2,  x3
        ld1             {v0.8b}, [x6]
        sxtl            v0.8h,  v0.8b
.ifc \type,avg
        mov             x7,  x0
.endif

        ld1             {v1.s}[0],  [x2], x3
        ld1             {v2.s}[0],  [x2], x3
        ld1             {v3.s}[0],  [x2], x3
        ld1             {v4.s}[0],  [x2], x3
        ld1             {v5.s}[0],  [x2], x3
        ld1             {v6.s}[0],  [x2], x3
        trn1            v1.2s,  v1.2s,  v3.2s
        ld1             {v7.s}[0],  [x2], x3
        trn1            v2.2s,  v2.2s,  v4.2s
        ld1             {v26.s}[0], [x2], x3
        uxtl            v17.8h, v1.8b
        trn1            v3.2s,  v3.2s,  v5.2s
        ld1             {v27.s}[0], [x2], x3
        uxtl            v18.8h, v2.8b
        trn1            v4.2s,  v4.2s,  v6.2s
        ld1             {v28.s}[0], [x2], x3
        uxtl            v19.8h, v3.8b
        trn1            v5.2s,  v5.2s,  v7.2s
        ld1             {v29.s}[0], [x2], x3
        uxtl            v20.8h, v4.8b
        trn1            v6.2s,  v6.2s,  v26.2s
        uxtl            v21.8h, v5.8b
        trn1            v7.2s,  v7.2s,  v27.2s
        uxtl            v22.8h, v6.8b
        trn1            v26.2s, v26.2s, v28.2s
        uxtl            v23.8h, v7.8b
        trn1            v27.2s, v27.2s, v29.2s
        uxtl            v24.8h, v26.8b
        uxtl            v25.8h, v27.8b

        convolve        v1,  v2,  v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v3,  v4
        do_store4       v1,  v2,  v5,  v6,  \type

        subs            x4,  x4,  #4
        b.eq            9f

        ld1             {v1.s}[0],  [x2], x3
        ld1             {v2.s}[0],  [x2], x3
        trn1            v28.2s, v28.2s, v1.2s
        trn1            v29.2s, v29.2s, v2.2s
        ld1             {v1.s}[1],  [x2], x3
        uxtl            v26.8h, v28.8b
        ld1             {v2.s}[1],  [x2], x3
        uxtl            v27.8h, v29.8b
        uxtl            v28.8h, v1.8b
        uxtl            v29.8h, v2.8b

        convolve        v1,  v2,  v21, v22, v23, v24, v25, v26, v27, v28, v29, \idx1, \idx2, v3,  v4
        do_store4       v1,  v2,  v5,  v6,  \type

9:
        ret
endfunc
.endm

do_8tap_4v put, 3, 4
do_8tap_4v put, 4, 3
do_8tap_4v avg, 3, 4
do_8tap_4v avg, 4, 3


.macro do_8tap_v_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
        uxtw            x4,  w4
        movrel          x5,  X(ff_vp9_subpel_filters), 120*\offset - 8
        cmp             w6,  #8
        add             x6,  x5,  w6, uxtw #3
        mov             x5,  #\size
.if \size >= 8
        b.ge            \type\()_8tap_8v_34
        b               \type\()_8tap_8v_43
.else
        b.ge            \type\()_8tap_4v_34
        b               \type\()_8tap_4v_43
.endif
endfunc
.endm

.macro do_8tap_v_filters size
do_8tap_v_func put, regular, 1, \size
do_8tap_v_func avg, regular, 1, \size
do_8tap_v_func put, sharp,   2, \size
do_8tap_v_func avg, sharp,   2, \size
do_8tap_v_func put, smooth,  0, \size
do_8tap_v_func avg, smooth,  0, \size
.endm

do_8tap_v_filters 64
do_8tap_v_filters 32
do_8tap_v_filters 16
do_8tap_v_filters 8
do_8tap_v_filters 4