libavcodec/arm/vp9lpf_neon.S
/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

@ Do an 8x8 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1, etc
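@ The transpose below proceeds from coarse to fine: the vtrn.32 and vtrn.16
@ steps operate on q registers (each holding two rows), and the final vtrn.8
@ steps finish the job on the individual d registers, so the full 8x8 byte
@ transpose needs only eight vtrn instructions.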
.macro transpose_q_8x8 rq0, rq1, rq2, rq3, r0, r1, r2, r3, r4, r5, r6, r7
        vtrn.32         \rq0, \rq2
        vtrn.32         \rq1, \rq3
        vtrn.16         \rq0, \rq1
        vtrn.16         \rq2, \rq3
        vtrn.8          \r0,  \r1
        vtrn.8          \r2,  \r3
        vtrn.8          \r4,  \r5
        vtrn.8          \r6,  \r7
.endm

@ Do a 4x4 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1
.macro transpose_q_4x4 rq0, rq1, r0, r1, r2, r3
        vtrn.16         \rq0, \rq1
        vtrn.8          \r0,  \r1
        vtrn.8          \r2,  \r3
.endm

@ The input to and output from this macro is in the registers d16-d31,
@ and d0-d7 are used as scratch registers.
@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
@ Depending on the width of the loop filter, we either use d16-d19
@ and d28-d31 as temp registers, or d8-d15.
@ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
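@ On entry, r2 holds the E threshold, r3 holds the I threshold, and the
@ H (hev) threshold is read from the top of the stack; callers that push
@ anything before invoking this code (e.g. the vpush of q4-q7 in the _16
@ wrappers below) therefore re-push the hev threshold so it is still
@ readable at [sp].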
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4
        vdup.u16        q0,  r2          @ E
        vdup.u8         d2,  r3          @ I
        ldr             r3,  [sp]

        vabd.u8         d4,  d20, d21    @ abs(p3 - p2)
        vabd.u8         d5,  d21, d22    @ abs(p2 - p1)
        vabd.u8         d6,  d22, d23    @ abs(p1 - p0)
        vabd.u8         d7,  d24, d25    @ abs(q0 - q1)
        vabd.u8         \tmp1, d25, d26  @ abs(q1 - q2)
        vabd.u8         \tmp2, d26, d27  @ abs(q2 - q3)
        vmax.u8         d4,  d4,  d5
        vmax.u8         d5,  d6,  d7
        vmax.u8         \tmp1, \tmp1, \tmp2
        vabdl.u8        q3,  d23, d24    @ abs(p0 - q0)
        vmax.u8         d4,  d4,  d5
        vadd.u16        q3,  q3,  q3     @ abs(p0 - q0) * 2
        vabd.u8         d5,  d22, d25    @ abs(p1 - q1)
        vmax.u8         d4,  d4,  \tmp1  @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vshr.u8         d5,  d5,  #1
        vcle.u8         d4,  d4,  d2     @ max(abs()) <= I
        vaddw.u8        q3,  q3,  d5     @ abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
        vcle.u16        q3,  q3,  q0
        vmovn.u16       d5,  q3
        vand            d4,  d4,  d5     @ fm
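        @ d4 now holds the per-pixel filter mask fm:
        @ fm = max(abs(p3 - p2), ..., abs(q2 - q3)) <= I &&
        @      abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E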

        vdup.u8         d3,  r3          @ H
        vmov            r2,  r3,  d4
        orr             r2,  r2,  r3
        cmp             r2,  #0
        @ If no pixels need filtering, just exit as soon as possible
        beq             9f

.if \wd >= 8
        vmov.u8         d0,  #1

        vabd.u8         d6,  d20, d23    @ abs(p3 - p0)
        vabd.u8         d2,  d21, d23    @ abs(p2 - p0)
        vabd.u8         d1,  d22, d23    @ abs(p1 - p0)
        vabd.u8         \tmp1, d25, d24  @ abs(q1 - q0)
        vabd.u8         \tmp2, d26, d24  @ abs(q2 - q0)
        vabd.u8         \tmp3, d27, d24  @ abs(q3 - q0)
        vmax.u8         d6,  d6,  d2
        vmax.u8         d1,  d1,  \tmp1
        vmax.u8         \tmp2, \tmp2, \tmp3
.if \wd == 16
        vabd.u8         d7,  d16, d23    @ abs(p7 - p0)
        vmax.u8         d6,  d6,  d1
        vabd.u8         d2,  d17, d23    @ abs(p6 - p0)
        vmax.u8         d6,  d6,  \tmp2
        vabd.u8         d1,  d18, d23    @ abs(p5 - p0)
        vcle.u8         d6,  d6,  d0     @ flat8in
        vabd.u8         d8,  d19, d23    @ abs(p4 - p0)
        vand            d6,  d6,  d4     @ flat8in && fm
        vabd.u8         d9,  d28, d24    @ abs(q4 - q0)
        vbic            d4,  d4,  d6     @ fm && !flat8in
        vabd.u8         d10, d29, d24    @ abs(q5 - q0)
        vabd.u8         d11, d30, d24    @ abs(q6 - q0)
        vabd.u8         d12, d31, d24    @ abs(q7 - q0)

        vmax.u8         d7,  d7,  d2
        vmax.u8         d1,  d1,  d8
        vmax.u8         d9,  d9,  d10
        vmax.u8         d11, d11, d12
        @ The rest of the calculation of flat8out is interleaved below
.else
        @ The rest of the calculation of flat8in is interleaved below
.endif
.endif

        @ Calculate the normal inner loop filter for 2 or 4 pixels
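        @ As a rough reference, the inner filter computed below corresponds to:
        @   f  = av_clip_int8(3 * (q0 - p0) + (hev ? av_clip_int8(p1 - q1) : 0))
        @   f1 = av_clip_int8(f + 4) >> 3
        @   f2 = av_clip_int8(f + 3) >> 3
        @   p0 = av_clip_uint8(p0 + f2), q0 = av_clip_uint8(q0 - f1)
        @   if (!hev) { f = (f1 + 1) >> 1;
        @               p1 = av_clip_uint8(p1 + f); q1 = av_clip_uint8(q1 - f); }
        @ applied only where fm (and, for p1/q1, !hev) is set.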
        vabd.u8         d5,  d22, d23    @ abs(p1 - p0)
.if \wd == 16
        vmax.u8         d7,  d7,  d1
        vmax.u8         d9,  d9,  d11
.elseif \wd == 8
        vmax.u8         d6,  d6,  d1
.endif
        vabd.u8         d1,  d25, d24    @ abs(q1 - q0)
.if \wd == 16
        vmax.u8         d7,  d7,  d9
.elseif \wd == 8
        vmax.u8         d6,  d6,  \tmp2
.endif
        vsubl.u8        \tmpq1, d22, d25 @ p1 - q1
        vmax.u8         d5,  d5,  d1     @ max(abs(p1 - p0), abs(q1 - q0))
        vsubl.u8        \tmpq2, d24, d23 @ q0 - p0
        vmov.s16        \tmpq3, #3
.if \wd == 8
        vcle.u8         d6,  d6,  d0     @ flat8in
.endif
        vcgt.u8         d5,  d5,  d3     @ hev
.if \wd == 8
        vand            d6,  d6,  d4     @ flat8in && fm
.endif
        vqmovn.s16      \tmp1, \tmpq1    @ av_clip_int8(p1 - q1)
.if \wd == 16
        vcle.u8         d7,  d7,  d0     @ flat8out
.elseif \wd == 8
        vbic            d4,  d4,  d6     @ fm && !flat8in
.endif
        vmvn            d5,  d5          @ !hev
.if \wd == 16
        vand            d7,  d7,  d6     @ flat8out && flat8in && fm
.endif
        vand            d5,  d5,  d4     @ !hev && fm && !flat8in

        vmul.s16        \tmpq2, \tmpq2, \tmpq3 @ 3 * (q0 - p0)
        vbic            \tmp1, \tmp1, d5 @ if (!hev) av_clip_int8 = 0
        vmov.s8         d2,  #4
        vaddw.s8        \tmpq2, \tmpq2, \tmp1 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        vmov.s8         d3,  #3
        vqmovn.s16      \tmp1, \tmpq2    @ f
.if \wd == 16
        vbic            d6,  d6,  d7     @ fm && flat8in && !flat8out
.endif

        vqadd.s8        \tmp3, \tmp1, d2 @ FFMIN(f + 4, 127)
        vqadd.s8        \tmp4, \tmp1, d3 @ FFMIN(f + 3, 127)
        vmovl.u8        q0,  d23         @ p0
        vshr.s8         \tmp3, \tmp3, #3 @ f1
        vshr.s8         \tmp4, \tmp4, #3 @ f2

        vmovl.u8        q1,  d24         @ q0
        vaddw.s8        q0,  q0,  \tmp4  @ p0 + f2
        vsubw.s8        q1,  q1,  \tmp3  @ q0 - f1
        vqmovun.s16     d0,  q0          @ out p0
        vqmovun.s16     d1,  q1          @ out q0
        vrshr.s8        \tmp3, \tmp3, #1 @ f = (f1 + 1) >> 1
        vbit            d23, d0,  d4     @ if (fm && !flat8in)
        vbit            d24, d1,  d4

        vmovl.u8        q0,  d22         @ p1
        vmovl.u8        q1,  d25         @ q1
        vaddw.s8        q0,  q0,  \tmp3  @ p1 + f
        vsubw.s8        q1,  q1,  \tmp3  @ q1 - f
        vqmovun.s16     d0,  q0          @ out p1
        vqmovun.s16     d2,  q1          @ out q1
        vbit            d22, d0,  d5     @ if (!hev && fm && !flat8in)
        vbit            d25, d2,  d5

.if \wd >= 8
        vmov            r2,  r3,  d6
        orr             r2,  r2,  r3
        cmp             r2,  #0
        @ If no pixels need flat8in, jump to flat8out
        @ (or to a writeout of the inner 4 pixels, for wd=8)
        beq             6f

        @ flat8in
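        @ The six flat8in outputs are 3-bit rounded averages of a sliding
        @ 8-tap window, e.g. out p2 = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3.
        @ The 8-term sum is kept in q0 and slid along for each subsequent
        @ output by adding the difference of the incoming and outgoing pairs.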
        vaddl.u8        \tmpq1, d20, d21
        vaddl.u8        \tmpq2, d22, d25
        vaddl.u8        \tmpq3, d20, d22
        vaddl.u8        \tmpq4, d23, d26
        vadd.u16        q0,  \tmpq1, \tmpq1
        vaddw.u8        q0,  q0,  d23
        vaddw.u8        q0,  q0,  d24
        vadd.u16        q0,  q0,  \tmpq3
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vsub.s16        \tmpq4, \tmpq4, \tmpq3
        vrshrn.u16      d2,  q0,  #3     @ out p2

        vadd.u16        q0,  q0,  \tmpq2
        vaddl.u8        \tmpq1, d20, d23
        vaddl.u8        \tmpq2, d24, d27
        vrshrn.u16      d3,  q0,  #3     @ out p1

        vadd.u16        q0,  q0,  \tmpq4
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vaddl.u8        \tmpq3, d21, d24
        vaddl.u8        \tmpq4, d25, d27
        vrshrn.u16      d4,  q0,  #3     @ out p0

        vadd.u16        q0,  q0,  \tmpq2
        vsub.s16        \tmpq4, \tmpq4, \tmpq3
        vaddl.u8        \tmpq1, d22, d25
        vaddl.u8        \tmpq2, d26, d27
        vrshrn.u16      d5,  q0,  #3     @ out q0

        vadd.u16        q0,  q0,  \tmpq4
        vsub.s16        \tmpq2, \tmpq2, \tmpq1
        vrshrn.u16      \tmp5, q0, #3    @ out q1

        vadd.u16        q0,  q0,  \tmpq2
        @ The output here is written back into the input registers. This doesn't
        @ matter for the flat8out part below, since we only update those pixels
        @ which won't be touched below.
        vbit            d21, d2,  d6
        vbit            d22, d3,  d6
        vbit            d23, d4,  d6
        vrshrn.u16      \tmp6, q0, #3    @ out q2
        vbit            d24, d5,  d6
        vbit            d25, \tmp5, d6
        vbit            d26, \tmp6, d6
.endif
.if \wd == 16
6:
        vorr            d2,  d6,  d7
        vmov            r2,  r3,  d2
        orr             r2,  r2,  r3
        cmp             r2,  #0
        @ If no pixels need flat8in or flat8out, jump to a
        @ writeout of the inner 4 pixels
        beq             7f
        vmov            r2,  r3,  d7
        orr             r2,  r2,  r3
        cmp             r2,  #0
        @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        beq             8f

        @ flat8out
        @ This writes all outputs into d2-d17 (skipping d7 and d16).
        @ If this part is skipped, the output is read from d21-d26 (which is the input
        @ to this section).
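        @ Like flat8in, but with a 16-tap window and 4-bit rounding, e.g.
        @ out p6 = (7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
        @ the running sum in q0 is again updated incrementally for each output.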
        vshll.u8        q0,  d16, #3     @ 8 * d16
        vsubw.u8        q0,  q0,  d16    @ 7 * d16
        vaddw.u8        q0,  q0,  d17
        vaddl.u8        q4,  d17, d18
        vaddl.u8        q5,  d19, d20
        vadd.s16        q0,  q0,  q4
        vaddl.u8        q4,  d16, d17
        vaddl.u8        q6,  d21, d22
        vadd.s16        q0,  q0,  q5
        vaddl.u8        q5,  d18, d25
        vaddl.u8        q7,  d23, d24
        vsub.s16        q5,  q5,  q4
        vadd.s16        q0,  q0,  q6
        vadd.s16        q0,  q0,  q7
        vaddl.u8        q6,  d16, d18
        vaddl.u8        q7,  d19, d26
        vrshrn.u16      d2,  q0,  #4

        vadd.s16        q0,  q0,  q5
        vaddl.u8        q4,  d16, d19
        vaddl.u8        q5,  d20, d27
        vsub.s16        q7,  q7,  q6
        vbif            d2,  d17, d7
        vrshrn.u16      d3,  q0,  #4

        vadd.s16        q0,  q0,  q7
        vaddl.u8        q6,  d16, d20
        vaddl.u8        q7,  d21, d28
        vsub.s16        q5,  q5,  q4
        vbif            d3,  d18, d7
        vrshrn.u16      d4,  q0,  #4

        vadd.s16        q0,  q0,  q5
        vaddl.u8        q4,  d16, d21
        vaddl.u8        q5,  d22, d29
        vsub.s16        q7,  q7,  q6
        vbif            d4,  d19, d7
        vrshrn.u16      d5,  q0,  #4

        vadd.s16        q0,  q0,  q7
        vaddl.u8        q6,  d16, d22
        vaddl.u8        q7,  d23, d30
        vsub.s16        q5,  q5,  q4
        vbif            d5,  d20, d7
        vrshrn.u16      d6,  q0,  #4

        vadd.s16        q0,  q0,  q5
        vaddl.u8        q5,  d16, d23
        vsub.s16        q7,  q7,  q6
        vaddl.u8        q6,  d24, d31
        vbif            d6,  d21, d7
        vrshrn.u16      d8,  q0,  #4

        vadd.s16        q0,  q0,  q7
        vsub.s16        q5,  q6,  q5
        vaddl.u8        q6,  d17, d24
        vaddl.u8        q7,  d25, d31
        vbif            d8,  d22, d7
        vrshrn.u16      d9,  q0,  #4

        vadd.s16        q0,  q0,  q5
        vsub.s16        q7,  q7,  q6
        vaddl.u8        q6,  d26, d31
        vbif            d9,  d23, d7
        vrshrn.u16      d10, q0,  #4

        vadd.s16        q0,  q0,  q7
        vaddl.u8        q7,  d18, d25
        vaddl.u8        q9,  d19, d26
        vsub.s16        q6,  q6,  q7
        vaddl.u8        q7,  d27, d31
        vbif            d10, d24, d7
        vrshrn.u16      d11, q0,  #4

        vadd.s16        q0,  q0,  q6
        vaddl.u8        q6,  d20, d27
        vsub.s16        q7,  q7,  q9
        vaddl.u8        q9,  d28, d31
        vbif            d11, d25, d7
        vsub.s16        q9,  q9,  q6
        vrshrn.u16      d12, q0,  #4

        vadd.s16        q0,  q0,  q7
        vaddl.u8        q7,  d21, d28
        vaddl.u8        q10, d29, d31
        vbif            d12, d26, d7
        vrshrn.u16      d13, q0,  #4

        vadd.s16        q0,  q0,  q9
        vsub.s16        q10, q10, q7
        vaddl.u8        q9,  d22, d29
        vaddl.u8        q11, d30, d31
        vbif            d13, d27, d7
        vrshrn.u16      d14, q0,  #4

        vadd.s16        q0,  q0,  q10
        vsub.s16        q11, q11, q9
        vbif            d14, d28, d7
        vrshrn.u16      d15, q0,  #4

        vadd.s16        q0,  q0,  q11
        vbif            d15, d29, d7
        vrshrn.u16      d17, q0,  #4
        vbif            d17, d30, d7
.endif
.endm

@ For wd <= 8, we use d16-d19 and d28-d31 as temp registers, while for
@ wd=16 we need those for inputs/outputs and use d8-d15 as temp registers
@ instead.
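@ Since d8-d15 (q4-q7) are callee-saved under the AAPCS, the wd=16 entry
@ points below save and restore them with vpush/vpop around the filter.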
.macro loop_filter_4
        loop_filter     4,  d16, d17, d18, d19, d28, d29, d30, d31, q8, q9, q14, q15
.endm

.macro loop_filter_8
        loop_filter     8,  d16, d17, d18, d19, d28, d29, d30, d31, q8, q9, q14, q15
.endm

.macro loop_filter_16
        loop_filter     16, d8,  d9,  d10, d11, d12, d13, d14, d15, q4, q5, q6, q7
.endm


@ The public functions in this file have the following signature:
@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
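@ Per the AAPCS, dst arrives in r0, stride in r1, mb_lim (E) in r2,
@ lim (I) in r3, and hev_thr (H) on the stack, which is where the
@ loop_filter macro above expects to find them.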

function ff_vp9_loop_filter_v_4_8_neon, export=1
        sub             r12, r0,  r1, lsl #2
        vld1.8          {d20}, [r12,:64], r1 @ p3
        vld1.8          {d24}, [r0, :64], r1 @ q0
        vld1.8          {d21}, [r12,:64], r1 @ p2
        vld1.8          {d25}, [r0, :64], r1 @ q1
        vld1.8          {d22}, [r12,:64], r1 @ p1
        vld1.8          {d26}, [r0, :64], r1 @ q2
        vld1.8          {d23}, [r12,:64], r1 @ p0
        vld1.8          {d27}, [r0, :64], r1 @ q3
        sub             r0,  r0,  r1, lsl #2
        sub             r12, r12, r1, lsl #1
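        @ After the loads, r12 has advanced to dst and r0 to dst + 4 * stride;
        @ the two subtractions leave r0 at the q0 row and r12 at the p1 row,
        @ ready for the writeback of the four changed rows below.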

        loop_filter_4

        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
9:
        bx              lr
endfunc

function ff_vp9_loop_filter_h_4_8_neon, export=1
        sub             r12, r0,  #4
        add             r0,  r12, r1, lsl #2
        vld1.8          {d20}, [r12], r1
        vld1.8          {d24}, [r0],  r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d25}, [r0],  r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d26}, [r0],  r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d27}, [r0],  r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
        @ outermost 2 pixels since they aren't changed.
        add             r12, r12, #2
        add             r0,  r0,  #2

        @ Transpose the 8x8 pixels, taking advantage of q registers, to get
        @ one register per column.
        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        loop_filter_4

        @ We will only write the mid 4 pixels back; after the loop filter,
        @ these are in d22, d23, d24, d25 (q11, q12), ordered as rows
        @ (8x4 pixels). We need to transpose them to columns, done with a
        @ 4x4 transpose (which in practice is two 4x4 transposes of the two
        @ 4x4 halves of the 8x4 pixels; into 4x8 pixels).
        transpose_q_4x4 q11, q12, d22, d23, d24, d25

        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d22[1]}, [r0],  r1
        vst1.32         {d23[0]}, [r12], r1
        vst1.32         {d23[1]}, [r0],  r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d24[1]}, [r0],  r1
        vst1.32         {d25[0]}, [r12], r1
        vst1.32         {d25[1]}, [r0],  r1
9:
        bx              lr
endfunc

function ff_vp9_loop_filter_v_8_8_neon, export=1
        sub             r12, r0,  r1, lsl #2
        vld1.8          {d20}, [r12,:64], r1 @ p3
        vld1.8          {d24}, [r0, :64], r1 @ q0
        vld1.8          {d21}, [r12,:64], r1 @ p2
        vld1.8          {d25}, [r0, :64], r1 @ q1
        vld1.8          {d22}, [r12,:64], r1 @ p1
        vld1.8          {d26}, [r0, :64], r1 @ q2
        vld1.8          {d23}, [r12,:64], r1 @ p0
        vld1.8          {d27}, [r0, :64], r1 @ q3
        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        add             r12, r12, r1

        loop_filter_8

        vst1.8          {d21}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d26}, [r0, :64], r1
9:
        bx              lr
6:
        sub             r12, r0,  r1, lsl #1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        bx              lr
endfunc

function ff_vp9_loop_filter_h_8_8_neon, export=1
        sub             r12, r0,  #4
        add             r0,  r12, r1, lsl #2
        vld1.8          {d20}, [r12], r1
        vld1.8          {d24}, [r0],  r1
        vld1.8          {d21}, [r12], r1
        vld1.8          {d25}, [r0],  r1
        vld1.8          {d22}, [r12], r1
        vld1.8          {d26}, [r0],  r1
        vld1.8          {d23}, [r12], r1
        vld1.8          {d27}, [r0],  r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2

        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        loop_filter_8

        @ Even though only 6 pixels per row have been changed, we write the
        @ full 8 pixel registers.
        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        vst1.8          {d20}, [r12], r1
        vst1.8          {d24}, [r0],  r1
        vst1.8          {d21}, [r12], r1
        vst1.8          {d25}, [r0],  r1
        vst1.8          {d22}, [r12], r1
        vst1.8          {d26}, [r0],  r1
        vst1.8          {d23}, [r12], r1
        vst1.8          {d27}, [r0],  r1
9:
        bx              lr
6:
        @ If we didn't need to do the flat8in part, we use the same writeback
        @ as in loop_filter_h_4_8.
        add             r12, r12, #2
        add             r0,  r0,  #2
        transpose_q_4x4 q11, q12, d22, d23, d24, d25
        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d22[1]}, [r0],  r1
        vst1.32         {d23[0]}, [r12], r1
        vst1.32         {d23[1]}, [r0],  r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d24[1]}, [r0],  r1
        vst1.32         {d25[0]}, [r12], r1
        vst1.32         {d25[1]}, [r0],  r1
        bx              lr
endfunc

function vp9_loop_filter_v_16_neon
        sub             r12, r0,  r1, lsl #3
        @ Read p7-p0 using r12 and q0-q7 using r0
        vld1.8          {d16}, [r12,:64], r1 @ p7
        vld1.8          {d24}, [r0, :64], r1 @ q0
        vld1.8          {d17}, [r12,:64], r1 @ p6
        vld1.8          {d25}, [r0, :64], r1 @ q1
        vld1.8          {d18}, [r12,:64], r1 @ p5
        vld1.8          {d26}, [r0, :64], r1 @ q2
        vld1.8          {d19}, [r12,:64], r1 @ p4
        vld1.8          {d27}, [r0, :64], r1 @ q3
        vld1.8          {d20}, [r12,:64], r1 @ p3
        vld1.8          {d28}, [r0, :64], r1 @ q4
        vld1.8          {d21}, [r12,:64], r1 @ p2
        vld1.8          {d29}, [r0, :64], r1 @ q5
        vld1.8          {d22}, [r12,:64], r1 @ p1
        vld1.8          {d30}, [r0, :64], r1 @ q6
        vld1.8          {d23}, [r12,:64], r1 @ p0
        vld1.8          {d31}, [r0, :64], r1 @ q7
        sub             r12, r12, r1, lsl #3
        sub             r0,  r0,  r1, lsl #3
        add             r12, r12, r1

        loop_filter_16

        @ If we did the flat8out part, we get the output in
        @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride,
        @ store d2-d9 there, and d10-d17 into r0.
        vst1.8          {d2},  [r12,:64], r1
        vst1.8          {d10}, [r0, :64], r1
        vst1.8          {d3},  [r12,:64], r1
        vst1.8          {d11}, [r0, :64], r1
        vst1.8          {d4},  [r12,:64], r1
        vst1.8          {d12}, [r0, :64], r1
        vst1.8          {d5},  [r12,:64], r1
        vst1.8          {d13}, [r0, :64], r1
        vst1.8          {d6},  [r12,:64], r1
        vst1.8          {d14}, [r0, :64], r1
        vst1.8          {d8},  [r12,:64], r1
        vst1.8          {d15}, [r0, :64], r1
        vst1.8          {d9},  [r12,:64], r1
        vst1.8          {d17}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #3
        add             r0,  r0,  r1
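        @ This restores r0 to the dst pointer the function was called with, so
        @ callers like loop_filter_v_16_16 can simply advance it for the second
        @ 8-pixel half; the other exit paths below do the same.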

9:
        bx              lr

8:
        add             r12, r12, r1, lsl #2
        @ If we didn't do the flat8out part, the output is left in the
        @ input registers.
        vst1.8          {d21}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d26}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #1
        sub             r0,  r0,  r1
        bx              lr
7:
        sub             r12, r0,  r1, lsl #1
        vst1.8          {d22}, [r12,:64], r1
        vst1.8          {d24}, [r0, :64], r1
        vst1.8          {d23}, [r12,:64], r1
        vst1.8          {d25}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #1
        bx              lr
endfunc

function ff_vp9_loop_filter_v_16_8_neon, export=1
        ldr             r12, [sp]
        push            {lr}
        vpush           {q4-q7}
        push            {r12}
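        @ vpush moved sp, so hev_thr (the original stack argument, read into
        @ r12 above) is pushed again here to keep it readable at [sp] inside
        @ the shared filter code.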
        bl              vp9_loop_filter_v_16_neon
        add             sp,  sp,  #4
        vpop            {q4-q7}
        pop             {pc}
endfunc

function ff_vp9_loop_filter_v_16_16_neon, export=1
        ldr             r12, [sp]
        // The filter clobbers r2 and r3, but we need to keep them for the second round
        push            {r2, r3, lr}
        vpush           {q4-q7}
        push            {r12}
        bl              vp9_loop_filter_v_16_neon
        add             r0,  #8
        ldr             r2,  [sp, #68]
        ldr             r3,  [sp, #72]
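        @ The saved r2/r3 sit above the re-pushed hev_thr (4 bytes) and the
        @ 64 bytes of q4-q7 on the stack, hence the #68 and #72 offsets.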
        bl              vp9_loop_filter_v_16_neon
        add             sp,  sp,  #4
        vpop            {q4-q7}
        pop             {r2, r3, pc}
endfunc

function vp9_loop_filter_h_16_neon
        sub             r12, r0,  #8
        vld1.8          {d16}, [r12,:64], r1
        vld1.8          {d24}, [r0, :64], r1
        vld1.8          {d17}, [r12,:64], r1
        vld1.8          {d25}, [r0, :64], r1
        vld1.8          {d18}, [r12,:64], r1
        vld1.8          {d26}, [r0, :64], r1
        vld1.8          {d19}, [r12,:64], r1
        vld1.8          {d27}, [r0, :64], r1
        vld1.8          {d20}, [r12,:64], r1
        vld1.8          {d28}, [r0, :64], r1
        vld1.8          {d21}, [r12,:64], r1
        vld1.8          {d29}, [r0, :64], r1
        vld1.8          {d22}, [r12,:64], r1
        vld1.8          {d30}, [r0, :64], r1
        vld1.8          {d23}, [r12,:64], r1
        vld1.8          {d31}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #3
        sub             r12, r12, r1, lsl #3

        @ The 16x8 pixels read above are in two 8x8 blocks; the left
        @ half in d16-d23, and the right half in d24-d31. Do two 8x8 transposes
        @ of this, to get one column per register. This could be done with two
        @ transpose_8x8 as below, but this takes advantage of the q registers.
        transpose16_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15
        vtrn.8          d16, d17
        vtrn.8          d18, d19
        vtrn.8          d20, d21
        vtrn.8          d22, d23
        vtrn.8          d24, d25
        vtrn.8          d26, d27
        vtrn.8          d28, d29
        vtrn.8          d30, d31

        loop_filter_16

        @ Transpose back; this is the same transpose as above, but
        @ we can't take advantage of q registers for the transpose, since
        @ the d registers in the transpose aren't all consecutive.
        transpose_8x8   d16, d2,  d3,  d4,  d5,  d6,  d8,  d9
        transpose_8x8   d10, d11, d12, d13, d14, d15, d17, d31

        vst1.8          {d16}, [r12,:64], r1
        vst1.8          {d10}, [r0, :64], r1

        vst1.8          {d2},  [r12,:64], r1
        vst1.8          {d11}, [r0, :64], r1

        vst1.8          {d3},  [r12,:64], r1
        vst1.8          {d12}, [r0, :64], r1

        vst1.8          {d4},  [r12,:64], r1
        vst1.8          {d13}, [r0, :64], r1

        vst1.8          {d5},  [r12,:64], r1
        vst1.8          {d14}, [r0, :64], r1

        vst1.8          {d6},  [r12,:64], r1
        vst1.8          {d15}, [r0, :64], r1

        vst1.8          {d8},  [r12,:64], r1
        vst1.8          {d17}, [r0, :64], r1

        vst1.8          {d9},  [r12,:64], r1
        vst1.8          {d31}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #3
9:
        bx              lr
8:
        @ The same writeback as in loop_filter_h_8_8
        sub             r12, r0,  #4
        add             r0,  r12, r1, lsl #2
        transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27

        vst1.8          {d20}, [r12], r1
        vst1.8          {d24}, [r0],  r1
        vst1.8          {d21}, [r12], r1
        vst1.8          {d25}, [r0],  r1
        vst1.8          {d22}, [r12], r1
        vst1.8          {d26}, [r0],  r1
        vst1.8          {d23}, [r12], r1
        vst1.8          {d27}, [r0],  r1
        sub             r0,  r0,  r1, lsl #3
        add             r0,  r0,  #4
        bx              lr
7:
        @ The same writeback as in loop_filter_h_4_8
        sub             r12, r0,  #2
        add             r0,  r12, r1, lsl #2
        transpose_q_4x4 q11, q12, d22, d23, d24, d25
        vst1.32         {d22[0]}, [r12], r1
        vst1.32         {d22[1]}, [r0],  r1
        vst1.32         {d23[0]}, [r12], r1
        vst1.32         {d23[1]}, [r0],  r1
        vst1.32         {d24[0]}, [r12], r1
        vst1.32         {d24[1]}, [r0],  r1
        vst1.32         {d25[0]}, [r12], r1
        vst1.32         {d25[1]}, [r0],  r1
        sub             r0,  r0,  r1, lsl #3
        add             r0,  r0,  #2
        bx              lr
endfunc

function ff_vp9_loop_filter_h_16_8_neon, export=1
        ldr             r12, [sp]
        push            {lr}
        vpush           {q4-q7}
        push            {r12}
        bl              vp9_loop_filter_h_16_neon
        add             sp,  sp,  #4
        vpop            {q4-q7}
        pop             {pc}
endfunc

function ff_vp9_loop_filter_h_16_16_neon, export=1
        ldr             r12, [sp]
        // The filter clobbers r2 and r3, but we need to keep them for the second round
        push            {r2, r3, lr}
        vpush           {q4-q7}
        push            {r12}
        bl              vp9_loop_filter_h_16_neon
        add             r0,  r0,  r1, lsl #3
        ldr             r2,  [sp, #68]
        ldr             r3,  [sp, #72]
        bl              vp9_loop_filter_h_16_neon
        add             sp,  sp,  #4
        vpop            {q4-q7}
        pop             {r2, r3, pc}
endfunc