arm/aarch64: vp9lpf: Calculate !hev directly
libavcodec/arm/vp9lpf_neon.S (libav.git)
1/*
2 * Copyright (c) 2016 Google Inc.
3 *
4 * This file is part of Libav.
5 *
6 * Libav is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * Libav is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with Libav; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/arm/asm.S"
22#include "neon.S"
23
24@ Do an 8x8 transpose, using q registers for the subtransposes that don't
25@ need to address the individual d registers.
26@ r0,r1 == rq0, r2,r3 == rq1, etc
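@ This is the usual three-pass NEON transpose: the 32-bit vtrn exchanges
@ 4x4 byte blocks between rows four apart, the 16-bit vtrn exchanges 2x2
@ blocks between rows two apart, and the 8-bit vtrn swaps the remaining
@ single bytes between adjacent rows.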
27.macro transpose_q_8x8 rq0, rq1, rq2, rq3, r0, r1, r2, r3, r4, r5, r6, r7
28 vtrn.32 \rq0, \rq2
29 vtrn.32 \rq1, \rq3
30 vtrn.16 \rq0, \rq1
31 vtrn.16 \rq2, \rq3
32 vtrn.8 \r0, \r1
33 vtrn.8 \r2, \r3
34 vtrn.8 \r4, \r5
35 vtrn.8 \r6, \r7
36.endm
37
38@ Do a 4x4 transpose, using q registers for the subtransposes that don't
39@ need to address the individual d registers.
40@ r0,r1 == rq0, r2,r3 == rq1
41.macro transpose_q_4x4 rq0, rq1, r0, r1, r2, r3
42 vtrn.16 \rq0, \rq1
43 vtrn.8 \r0, \r1
44 vtrn.8 \r2, \r3
45.endm
46
47@ The input to and output from this macro are in the registers d16-d31,
48@ and d0-d7 are used as scratch registers.
49@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
50@ Depending on the width of the loop filter, we either use d16-d19
51@ and d28-d31 as temp registers, or d8-d15.
52@ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
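@ In C terms, the masks computed below correspond roughly to:
@   fm       = max(abs(p3 - p2), ..., abs(q2 - q3)) <= I &&
@              abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
@   !hev     = max(abs(p1 - p0), abs(q1 - q0)) <= H
@   flat8in  = max(abs(p3 - p0), ..., abs(q3 - q0)) <= 1   (wd >= 8)
@   flat8out = max(abs(p7 - p0), ..., abs(q7 - q0)) <= 1   (wd == 16)
@ Pixels with fm set get at least the basic inner filter; those that also
@ have flat8in set get the flat8 filter instead, and (for wd == 16) those
@ that additionally have flat8out set get the wide filter.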
53.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4
54 vdup.u16 q0, r2 @ E
55 vdup.u8 d2, r3 @ I
56 ldr r3, [sp]
57
58 vabd.u8 d4, d20, d21 @ abs(p3 - p2)
59 vabd.u8 d5, d21, d22 @ abs(p2 - p1)
60 vabd.u8 d6, d22, d23 @ abs(p1 - p0)
61 vabd.u8 d7, d24, d25 @ abs(q0 - q1)
62 vabd.u8 \tmp1, d25, d26 @ abs(q1 - q2)
63 vabd.u8 \tmp2, d26, d27 @ abs(q2 - q3)
64 vmax.u8 d4, d4, d5
65 vmax.u8 d5, d6, d7
66 vmax.u8 \tmp1, \tmp1, \tmp2
67 vabdl.u8 q3, d23, d24 @ abs(p0 - q0)
68 vmax.u8 d4, d4, d5
69 vadd.u16 q3, q3, q3 @ abs(p0 - q0) * 2
70 vabd.u8 d5, d22, d25 @ abs(p1 - q1)
71 vmax.u8 d4, d4, \tmp1 @ max(abs(p3 - p2), ..., abs(q2 - q3))
72 vshr.u8 d5, d5, #1
73 vcle.u8 d4, d4, d2 @ max(abs()) <= I
74 vaddw.u8 q3, q3, d5 @ abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
75 vcle.u16 q3, q3, q0
76 vmovn.u16 d5, q3
77 vand d4, d4, d5 @ fm
78
79 vdup.u8 d3, r3 @ H
80 vmov r2, r3, d4
81 orr r2, r2, r3
82 cmp r2, #0
83 @ If no pixels need filtering, just exit as soon as possible
84 beq 9f
85
86.if \wd >= 8
87 vmov.u8 d0, #1
88
89 vabd.u8 d6, d20, d23 @ abs(p3 - p0)
90 vabd.u8 d2, d21, d23 @ abs(p2 - p0)
91 vabd.u8 d1, d22, d23 @ abs(p1 - p0)
92 vabd.u8 \tmp1, d25, d24 @ abs(q1 - q0)
93 vabd.u8 \tmp2, d26, d24 @ abs(q2 - q0)
94 vabd.u8 \tmp3, d27, d24 @ abs(q3 - q0)
95 vmax.u8 d6, d6, d2
96 vmax.u8 d1, d1, \tmp1
97 vmax.u8 \tmp2, \tmp2, \tmp3
98.if \wd == 16
99 vabd.u8 d7, d16, d23 @ abs(p7 - p0)
100 vmax.u8 d6, d6, d1
101 vabd.u8 d2, d17, d23 @ abs(p6 - p0)
102 vmax.u8 d6, d6, \tmp2
103 vabd.u8 d1, d18, d23 @ abs(p5 - p0)
104 vcle.u8 d6, d6, d0 @ flat8in
105 vabd.u8 d8, d19, d23 @ abs(p4 - p0)
106 vand d6, d6, d4 @ flat8in && fm
107 vabd.u8 d9, d28, d24 @ abs(q4 - q0)
108 vbic d4, d4, d6 @ fm && !flat8in
109 vabd.u8 d10, d29, d24 @ abs(q5 - q0)
110 vabd.u8 d11, d30, d24 @ abs(q6 - q0)
111 vabd.u8 d12, d31, d24 @ abs(q7 - q0)
112
113 vmax.u8 d7, d7, d2
114 vmax.u8 d1, d1, d8
115 vmax.u8 d9, d9, d10
116 vmax.u8 d11, d11, d12
117 @ The rest of the calculation of flat8out is interleaved below
118.else
119 @ The rest of the calculation of flat8in is interleaved below
120.endif
121.endif
122
123 @ Calculate the normal inner loop filter for 2 or 4 pixels
124 vabd.u8 d5, d22, d23 @ abs(p1 - p0)
125.if \wd == 16
126 vmax.u8 d7, d7, d1
127 vmax.u8 d9, d9, d11
128.elseif \wd == 8
129 vmax.u8 d6, d6, d1
130.endif
131 vabd.u8 d1, d25, d24 @ abs(q1 - q0)
132.if \wd == 16
133 vmax.u8 d7, d7, d9
134.elseif \wd == 8
135 vmax.u8 d6, d6, \tmp2
136.endif
137 vsubl.u8 \tmpq1, d22, d25 @ p1 - q1
138 vmax.u8 d5, d5, d1 @ max(abs(p1 - p0), abs(q1 - q0))
139 vsubl.u8 \tmpq2, d24, d23 @ q0 - p0
140 vmov.s16 \tmpq3, #3
141.if \wd == 8
142 vcle.u8 d6, d6, d0 @ flat8in
143.endif
144 vcle.u8 d5, d5, d3 @ !hev
145.if \wd == 8
146 vand d6, d6, d4 @ flat8in && fm
147.endif
148 vqmovn.s16 \tmp1, \tmpq1 @ av_clip_int8(p1 - q1)
149.if \wd == 16
150 vcle.u8 d7, d7, d0 @ flat8out
151.elseif \wd == 8
152 vbic d4, d4, d6 @ fm && !flat8in
153.endif
154 vand d5, d5, d4 @ !hev && fm && !flat8in
155.if \wd == 16
156 vand d7, d7, d6 @ flat8out && flat8in && fm
157.endif
158
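        @ Roughly following the C reference for the inner filter:
        @   f  = av_clip_int8(3 * (q0 - p0) + (hev ? av_clip_int8(p1 - q1) : 0))
        @   f1 = FFMIN(f + 4, 127) >> 3,  f2 = FFMIN(f + 3, 127) >> 3
        @   p0 += f2, q0 -= f1
        @   if (!hev) { f = (f1 + 1) >> 1; p1 += f, q1 -= f }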
159 vmul.s16 \tmpq2, \tmpq2, \tmpq3 @ 3 * (q0 - p0)
160 vbic \tmp1, \tmp1, d5 @ if (!hev) av_clip_int8 = 0
161 vmov.s8 d2, #4
162 vaddw.s8 \tmpq2, \tmpq2, \tmp1 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
163 vmov.s8 d3, #3
164 vqmovn.s16 \tmp1, \tmpq2 @ f
165.if \wd == 16
166 vbic d6, d6, d7 @ fm && flat8in && !flat8out
167.endif
168
169 vqadd.s8 \tmp3, \tmp1, d2 @ FFMIN(f + 4, 127)
170 vqadd.s8 \tmp4, \tmp1, d3 @ FFMIN(f + 3, 127)
171 vmovl.u8 q0, d23 @ p0
172 vshr.s8 \tmp3, \tmp3, #3 @ f1
173 vshr.s8 \tmp4, \tmp4, #3 @ f2
174
175 vmovl.u8 q1, d24 @ q0
176 vaddw.s8 q0, q0, \tmp4 @ p0 + f2
177 vsubw.s8 q1, q1, \tmp3 @ q0 - f1
178 vqmovun.s16 d0, q0 @ out p0
179 vqmovun.s16 d1, q1 @ out q0
180 vrshr.s8 \tmp3, \tmp3, #1 @ f = (f1 + 1) >> 1
181 vbit d23, d0, d4 @ if (fm && !flat8in)
182 vbit d24, d1, d4
183
184 vmovl.u8 q0, d22 @ p1
185 vmovl.u8 q1, d25 @ q1
186 vaddw.s8 q0, q0, \tmp3 @ p1 + f
187 vsubw.s8 q1, q1, \tmp3 @ q1 - f
188 vqmovun.s16 d0, q0 @ out p1
189 vqmovun.s16 d2, q1 @ out q1
190 vbit d22, d0, d5 @ if (!hev && fm && !flat8in)
191 vbit d25, d2, d5
192
193.if \wd >= 8
194 vmov r2, r3, d6
195 orr r2, r2, r3
196 cmp r2, #0
197 @ If no pixels need flat8in, jump to flat8out
198 @ (or to a writeout of the inner 4 pixels, for wd=8)
199 beq 6f
200
201 @ flat8in
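        @ Each flat8in output is a rounded 7-tap average computed with a
        @ sliding window; roughly, in C:
        @   p2' = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3
        @   p1' = (2*p3 + p2 + 2*p1 + p0 + q0 + q1 + 4) >> 3
        @ and so on through q2', each output dropping the oldest tap and
        @ adding the next pixel towards q3.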
202 vaddl.u8 \tmpq1, d20, d21
203 vaddl.u8 \tmpq2, d22, d25
204 vaddl.u8 \tmpq3, d20, d22
205 vaddl.u8 \tmpq4, d23, d26
206 vadd.u16 q0, \tmpq1, \tmpq1
207 vaddw.u8 q0, q0, d23
208 vaddw.u8 q0, q0, d24
209 vadd.u16 q0, q0, \tmpq3
210 vsub.s16 \tmpq2, \tmpq2, \tmpq1
211 vsub.s16 \tmpq4, \tmpq4, \tmpq3
212 vrshrn.u16 d2, q0, #3 @ out p2
213
214 vadd.u16 q0, q0, \tmpq2
215 vaddl.u8 \tmpq1, d20, d23
216 vaddl.u8 \tmpq2, d24, d27
217 vrshrn.u16 d3, q0, #3 @ out p1
218
219 vadd.u16 q0, q0, \tmpq4
220 vsub.s16 \tmpq2, \tmpq2, \tmpq1
221 vaddl.u8 \tmpq3, d21, d24
222 vaddl.u8 \tmpq4, d25, d27
223 vrshrn.u16 d4, q0, #3 @ out p0
224
225 vadd.u16 q0, q0, \tmpq2
226 vsub.s16 \tmpq4, \tmpq4, \tmpq3
227 vaddl.u8 \tmpq1, d22, d25
228 vaddl.u8 \tmpq2, d26, d27
229 vrshrn.u16 d5, q0, #3 @ out q0
230
231 vadd.u16 q0, q0, \tmpq4
232 vsub.s16 \tmpq2, \tmpq2, \tmpq1
233 vrshrn.u16 \tmp5, q0, #3 @ out q1
234
235 vadd.u16 q0, q0, \tmpq2
236 @ The output here is written back into the input registers. This doesn't
237 @ matter for the flat8out part below, since we only update those pixels
238 @ which won't be touched below.
239 vbit d21, d2, d6
240 vbit d22, d3, d6
241 vbit d23, d4, d6
242 vrshrn.u16 \tmp6, q0, #3 @ out q2
243 vbit d24, d5, d6
244 vbit d25, \tmp5, d6
245 vbit d26, \tmp6, d6
246.endif
247.if \wd == 16
2486:
249 vorr d2, d6, d7
250 vmov r2, r3, d2
251 orr r2, r2, r3
252 cmp r2, #0
253 @ If no pixels need flat8in or flat8out, jump to a
254 @ writeout of the inner 4 pixels
255 beq 7f
256 vmov r2, r3, d7
257 orr r2, r2, r3
258 cmp r2, #0
259 @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
260 beq 8f
261
262 @ flat8out
263 @ This writes all outputs into d2-d17 (skipping d7 and d16).
264 @ If this part is skipped, the output is read from d21-d26 (which is the input
265 @ to this section).
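        @ Each flat8out output is a rounded 15-tap average computed with a
        @ sliding window, starting roughly from
        @   p6' = (7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
        @ and then dropping the oldest tap and adding the next q pixel for
        @ each following output, down to q6'.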
266 vshll.u8 q0, d16, #3 @ 8 * d16
267 vsubw.u8 q0, q0, d16 @ 7 * d16
268 vaddw.u8 q0, q0, d17
269 vaddl.u8 q4, d17, d18
270 vaddl.u8 q5, d19, d20
271 vadd.s16 q0, q0, q4
272 vaddl.u8 q4, d16, d17
273 vaddl.u8 q6, d21, d22
274 vadd.s16 q0, q0, q5
275 vaddl.u8 q5, d18, d25
276 vaddl.u8 q7, d23, d24
277 vsub.s16 q5, q5, q4
278 vadd.s16 q0, q0, q6
279 vadd.s16 q0, q0, q7
280 vaddl.u8 q6, d16, d18
281 vaddl.u8 q7, d19, d26
282 vrshrn.u16 d2, q0, #4
283
284 vadd.s16 q0, q0, q5
285 vaddl.u8 q4, d16, d19
286 vaddl.u8 q5, d20, d27
287 vsub.s16 q7, q7, q6
288 vbif d2, d17, d7
289 vrshrn.u16 d3, q0, #4
290
291 vadd.s16 q0, q0, q7
292 vaddl.u8 q6, d16, d20
293 vaddl.u8 q7, d21, d28
294 vsub.s16 q5, q5, q4
295 vbif d3, d18, d7
296 vrshrn.u16 d4, q0, #4
297
298 vadd.s16 q0, q0, q5
299 vaddl.u8 q4, d16, d21
300 vaddl.u8 q5, d22, d29
301 vsub.s16 q7, q7, q6
302 vbif d4, d19, d7
303 vrshrn.u16 d5, q0, #4
304
305 vadd.s16 q0, q0, q7
306 vaddl.u8 q6, d16, d22
307 vaddl.u8 q7, d23, d30
308 vsub.s16 q5, q5, q4
309 vbif d5, d20, d7
310 vrshrn.u16 d6, q0, #4
311
312 vadd.s16 q0, q0, q5
313 vaddl.u8 q5, d16, d23
314 vsub.s16 q7, q7, q6
315 vaddl.u8 q6, d24, d31
316 vbif d6, d21, d7
317 vrshrn.u16 d8, q0, #4
318
319 vadd.s16 q0, q0, q7
320 vsub.s16 q5, q6, q5
321 vaddl.u8 q6, d17, d24
322 vaddl.u8 q7, d25, d31
323 vbif d8, d22, d7
324 vrshrn.u16 d9, q0, #4
325
326 vadd.s16 q0, q0, q5
327 vsub.s16 q7, q7, q6
328 vaddl.u8 q6, d26, d31
329 vbif d9, d23, d7
330 vrshrn.u16 d10, q0, #4
331
332 vadd.s16 q0, q0, q7
333 vaddl.u8 q7, d18, d25
334 vaddl.u8 q9, d19, d26
335 vsub.s16 q6, q6, q7
336 vaddl.u8 q7, d27, d31
337 vbif d10, d24, d7
338 vrshrn.u16 d11, q0, #4
339
340 vadd.s16 q0, q0, q6
341 vaddl.u8 q6, d20, d27
342 vsub.s16 q7, q7, q9
343 vaddl.u8 q9, d28, d31
344 vbif d11, d25, d7
345 vsub.s16 q9, q9, q6
346 vrshrn.u16 d12, q0, #4
347
348 vadd.s16 q0, q0, q7
349 vaddl.u8 q7, d21, d28
350 vaddl.u8 q10, d29, d31
351 vbif d12, d26, d7
352 vrshrn.u16 d13, q0, #4
353
354 vadd.s16 q0, q0, q9
355 vsub.s16 q10, q10, q7
356 vaddl.u8 q9, d22, d29
357 vaddl.u8 q11, d30, d31
358 vbif d13, d27, d7
359 vrshrn.u16 d14, q0, #4
360
361 vadd.s16 q0, q0, q10
362 vsub.s16 q11, q11, q9
363 vbif d14, d28, d7
364 vrshrn.u16 d15, q0, #4
365
366 vadd.s16 q0, q0, q11
367 vbif d15, d29, d7
368 vrshrn.u16 d17, q0, #4
369 vbif d17, d30, d7
370.endif
371.endm
372
373@ For wd <= 8, we use d16-d19 and d28-d31 as temp registers,
374@ while for wd=16 those are needed for inputs/outputs, so we use
375@ d8-d15 as temp registers there instead.
376.macro loop_filter_4
377 loop_filter 4, d16, d17, d18, d19, d28, d29, d30, d31, q8, q9, q14, q15
378.endm
379
380.macro loop_filter_8
381 loop_filter 8, d16, d17, d18, d19, d28, d29, d30, d31, q8, q9, q14, q15
382.endm
383
384.macro loop_filter_16
385 loop_filter 16, d8, d9, d10, d11, d12, d13, d14, d15, q4, q5, q6, q7
386.endm
387
388
389@ The public functions in this file have the following signature:
390@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
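@ dst is passed in r0, stride in r1, mb_lim (E) in r2, lim (I) in r3 and
@ hev_thr (H) on the stack, which is where the loop_filter macro above
@ picks them up.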
391
392function ff_vp9_loop_filter_v_4_8_neon, export=1
393 sub r12, r0, r1, lsl #2
394 vld1.8 {d20}, [r12,:64], r1 @ p3
395 vld1.8 {d24}, [r0, :64], r1 @ q0
396 vld1.8 {d21}, [r12,:64], r1 @ p2
397 vld1.8 {d25}, [r0, :64], r1 @ q1
398 vld1.8 {d22}, [r12,:64], r1 @ p1
399 vld1.8 {d26}, [r0, :64], r1 @ q2
400 vld1.8 {d23}, [r12,:64], r1 @ p0
401 vld1.8 {d27}, [r0, :64], r1 @ q3
402 sub r0, r0, r1, lsl #2
403 sub r12, r12, r1, lsl #1
404
405 loop_filter_4
406
407 vst1.8 {d22}, [r12,:64], r1
408 vst1.8 {d24}, [r0, :64], r1
409 vst1.8 {d23}, [r12,:64], r1
410 vst1.8 {d25}, [r0, :64], r1
4119:
412 bx lr
413endfunc
414
415function ff_vp9_loop_filter_h_4_8_neon, export=1
416 sub r12, r0, #4
417 add r0, r12, r1, lsl #2
418 vld1.8 {d20}, [r12], r1
419 vld1.8 {d24}, [r0], r1
420 vld1.8 {d21}, [r12], r1
421 vld1.8 {d25}, [r0], r1
422 vld1.8 {d22}, [r12], r1
423 vld1.8 {d26}, [r0], r1
424 vld1.8 {d23}, [r12], r1
425 vld1.8 {d27}, [r0], r1
426
427 sub r12, r12, r1, lsl #2
428 sub r0, r0, r1, lsl #2
429 @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
430 @ outermost 2 pixels since they aren't changed.
431 add r12, r12, #2
432 add r0, r0, #2
433
434 @ Transpose the 8x8 pixels, taking advantage of q registers, to get
435 @ one register per column.
436 transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
437
438 loop_filter_4
439
440 @ We will only write the middle 4 pixels back; after the loop filter,
441 @ these are in d22, d23, d24, d25 (q11, q12), ordered as rows
442 @ (8x4 pixels). We need to transpose them to columns, done with a
443 @ 4x4 transpose (in practice, two 4x4 transposes of the two halves
444 @ of the 8x4 block, giving 4x8 pixels).
445 transpose_q_4x4 q11, q12, d22, d23, d24, d25
446
447 vst1.32 {d22[0]}, [r12], r1
448 vst1.32 {d22[1]}, [r0], r1
449 vst1.32 {d23[0]}, [r12], r1
450 vst1.32 {d23[1]}, [r0], r1
451 vst1.32 {d24[0]}, [r12], r1
452 vst1.32 {d24[1]}, [r0], r1
453 vst1.32 {d25[0]}, [r12], r1
454 vst1.32 {d25[1]}, [r0], r1
4559:
456 bx lr
457endfunc
458
459function ff_vp9_loop_filter_v_8_8_neon, export=1
460 sub r12, r0, r1, lsl #2
461 vld1.8 {d20}, [r12,:64], r1 @ p3
462 vld1.8 {d24}, [r0, :64], r1 @ q0
463 vld1.8 {d21}, [r12,:64], r1 @ p2
464 vld1.8 {d25}, [r0, :64], r1 @ q1
465 vld1.8 {d22}, [r12,:64], r1 @ p1
466 vld1.8 {d26}, [r0, :64], r1 @ q2
467 vld1.8 {d23}, [r12,:64], r1 @ p0
468 vld1.8 {d27}, [r0, :64], r1 @ q3
469 sub r12, r12, r1, lsl #2
470 sub r0, r0, r1, lsl #2
471 add r12, r12, r1
472
473 loop_filter_8
474
475 vst1.8 {d21}, [r12,:64], r1
476 vst1.8 {d24}, [r0, :64], r1
477 vst1.8 {d22}, [r12,:64], r1
478 vst1.8 {d25}, [r0, :64], r1
479 vst1.8 {d23}, [r12,:64], r1
480 vst1.8 {d26}, [r0, :64], r1
4819:
482 bx lr
4836:
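        @ If we didn't need to do the flat8in part, only the inner 4 pixels
        @ (p1, p0, q0, q1) are written back, as in loop_filter_v_4_8.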
484 sub r12, r0, r1, lsl #1
485 vst1.8 {d22}, [r12,:64], r1
486 vst1.8 {d24}, [r0, :64], r1
487 vst1.8 {d23}, [r12,:64], r1
488 vst1.8 {d25}, [r0, :64], r1
489 bx lr
490endfunc
491
492function ff_vp9_loop_filter_h_8_8_neon, export=1
493 sub r12, r0, #4
494 add r0, r12, r1, lsl #2
495 vld1.8 {d20}, [r12], r1
496 vld1.8 {d24}, [r0], r1
497 vld1.8 {d21}, [r12], r1
498 vld1.8 {d25}, [r0], r1
499 vld1.8 {d22}, [r12], r1
500 vld1.8 {d26}, [r0], r1
501 vld1.8 {d23}, [r12], r1
502 vld1.8 {d27}, [r0], r1
503
504 sub r12, r12, r1, lsl #2
505 sub r0, r0, r1, lsl #2
506
507 transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
508
509 loop_filter_8
510
511 @ Even though only 6 pixels per row have been changed, we write back
512 @ the full 8-pixel registers.
513 transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
514
515 vst1.8 {d20}, [r12], r1
516 vst1.8 {d24}, [r0], r1
517 vst1.8 {d21}, [r12], r1
518 vst1.8 {d25}, [r0], r1
519 vst1.8 {d22}, [r12], r1
520 vst1.8 {d26}, [r0], r1
521 vst1.8 {d23}, [r12], r1
522 vst1.8 {d27}, [r0], r1
5239:
524 bx lr
5256:
526 @ If we didn't need to do the flat8in part, we use the same writeback
527 @ as in loop_filter_h_4_8.
528 add r12, r12, #2
529 add r0, r0, #2
530 transpose_q_4x4 q11, q12, d22, d23, d24, d25
531 vst1.32 {d22[0]}, [r12], r1
532 vst1.32 {d22[1]}, [r0], r1
533 vst1.32 {d23[0]}, [r12], r1
534 vst1.32 {d23[1]}, [r0], r1
535 vst1.32 {d24[0]}, [r12], r1
536 vst1.32 {d24[1]}, [r0], r1
537 vst1.32 {d25[0]}, [r12], r1
538 vst1.32 {d25[1]}, [r0], r1
539 bx lr
540endfunc
541
542function vp9_loop_filter_v_16_neon
543 sub r12, r0, r1, lsl #3
544 @ Read p7-p0 using r12 and q0-q7 using r0
545 vld1.8 {d16}, [r12,:64], r1 @ p7
546 vld1.8 {d24}, [r0, :64], r1 @ q0
547 vld1.8 {d17}, [r12,:64], r1 @ p6
548 vld1.8 {d25}, [r0, :64], r1 @ q1
549 vld1.8 {d18}, [r12,:64], r1 @ p5
550 vld1.8 {d26}, [r0, :64], r1 @ q2
551 vld1.8 {d19}, [r12,:64], r1 @ p4
552 vld1.8 {d27}, [r0, :64], r1 @ q3
553 vld1.8 {d20}, [r12,:64], r1 @ p3
554 vld1.8 {d28}, [r0, :64], r1 @ q4
555 vld1.8 {d21}, [r12,:64], r1 @ p2
556 vld1.8 {d29}, [r0, :64], r1 @ q5
557 vld1.8 {d22}, [r12,:64], r1 @ p1
558 vld1.8 {d30}, [r0, :64], r1 @ q6
559 vld1.8 {d23}, [r12,:64], r1 @ p0
560 vld1.8 {d31}, [r0, :64], r1 @ q7
561 sub r12, r12, r1, lsl #3
562 sub r0, r0, r1, lsl #3
563 add r12, r12, r1
564
565 loop_filter_16
566
567 @ If we did the flat8out part, we get the output in
568 @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride,
569 @ store d2-d9 there, and d10-d17 into r0.
570 vst1.8 {d2}, [r12,:64], r1
571 vst1.8 {d10}, [r0, :64], r1
572 vst1.8 {d3}, [r12,:64], r1
573 vst1.8 {d11}, [r0, :64], r1
574 vst1.8 {d4}, [r12,:64], r1
575 vst1.8 {d12}, [r0, :64], r1
576 vst1.8 {d5}, [r12,:64], r1
577 vst1.8 {d13}, [r0, :64], r1
578 vst1.8 {d6}, [r12,:64], r1
579 vst1.8 {d14}, [r0, :64], r1
580 vst1.8 {d8}, [r12,:64], r1
581 vst1.8 {d15}, [r0, :64], r1
582 vst1.8 {d9}, [r12,:64], r1
583 vst1.8 {d17}, [r0, :64], r1
584 sub r0, r0, r1, lsl #3
585 add r0, r0, r1
586
5879:
588 bx lr
589
5908:
591 add r12, r12, r1, lsl #2
592 @ If we didn't do the flat8out part, the output is left in the
593 @ input registers.
594 vst1.8 {d21}, [r12,:64], r1
595 vst1.8 {d24}, [r0, :64], r1
596 vst1.8 {d22}, [r12,:64], r1
597 vst1.8 {d25}, [r0, :64], r1
598 vst1.8 {d23}, [r12,:64], r1
599 vst1.8 {d26}, [r0, :64], r1
600 sub r0, r0, r1, lsl #1
601 sub r0, r0, r1
602 bx lr
6037:
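        @ If only the basic filter was needed, write back just the inner
        @ 4 pixels (p1, p0, q0, q1).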
604 sub r12, r0, r1, lsl #1
605 vst1.8 {d22}, [r12,:64], r1
606 vst1.8 {d24}, [r0, :64], r1
607 vst1.8 {d23}, [r12,:64], r1
608 vst1.8 {d25}, [r0, :64], r1
609 sub r0, r0, r1, lsl #1
610 bx lr
611endfunc
612
613function ff_vp9_loop_filter_v_16_8_neon, export=1
614 ldr r12, [sp]
615 push {lr}
616 vpush {q4-q7}
617 push {r12}
618 bl vp9_loop_filter_v_16_neon
619 add sp, sp, #4
620 vpop {q4-q7}
621 pop {pc}
622endfunc
623
624function ff_vp9_loop_filter_v_16_16_neon, export=1
625 ldr r12, [sp]
626 // The filter clobbers r2 and r3, but we need to keep them for the second round
627 push {r2, r3, lr}
628 vpush {q4-q7}
629 push {r12}
630 bl vp9_loop_filter_v_16_neon
631 add r0, #8
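        @ On the stack: 4 bytes for the pushed r12 (hev_thr), 64 bytes for
        @ q4-q7, then the saved r2, r3 and lr; the original r2/r3 are thus
        @ found at sp + 68 and sp + 72.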
632 ldr r2, [sp, #68]
633 ldr r3, [sp, #72]
634 bl vp9_loop_filter_v_16_neon
635 add sp, sp, #4
636 vpop {q4-q7}
637 pop {r2, r3, pc}
638endfunc
639
640function vp9_loop_filter_h_16_neon
641 sub r12, r0, #8
642 vld1.8 {d16}, [r12,:64], r1
643 vld1.8 {d24}, [r0, :64], r1
644 vld1.8 {d17}, [r12,:64], r1
645 vld1.8 {d25}, [r0, :64], r1
646 vld1.8 {d18}, [r12,:64], r1
647 vld1.8 {d26}, [r0, :64], r1
648 vld1.8 {d19}, [r12,:64], r1
649 vld1.8 {d27}, [r0, :64], r1
650 vld1.8 {d20}, [r12,:64], r1
651 vld1.8 {d28}, [r0, :64], r1
652 vld1.8 {d21}, [r12,:64], r1
653 vld1.8 {d29}, [r0, :64], r1
654 vld1.8 {d22}, [r12,:64], r1
655 vld1.8 {d30}, [r0, :64], r1
656 vld1.8 {d23}, [r12,:64], r1
657 vld1.8 {d31}, [r0, :64], r1
658 sub r0, r0, r1, lsl #3
659 sub r12, r12, r1, lsl #3
660
661 @ The 16x8 pixels read above are in two 8x8 blocks; the left
662 @ half in d16-d23, and the right half in d24-d31. Do two 8x8 transposes
663 @ of this, to get one column per register. This could be done with two
664 @ transpose_8x8 as below, but this takes advantage of the q registers.
665 transpose16_4x4 q8, q9, q10, q11, q12, q13, q14, q15
666 vtrn.8 d16, d17
667 vtrn.8 d18, d19
668 vtrn.8 d20, d21
669 vtrn.8 d22, d23
670 vtrn.8 d24, d25
671 vtrn.8 d26, d27
672 vtrn.8 d28, d29
673 vtrn.8 d30, d31
674
675 loop_filter_16
676
677 @ Transpose back; this is the same transpose as above, but
678 @ we can't take advantage of q registers for the transpose, since
679 @ the d registers in the transpose aren't all consecutive.
680 transpose_8x8 d16, d2, d3, d4, d5, d6, d8, d9
681 transpose_8x8 d10, d11, d12, d13, d14, d15, d17, d31
682
683 vst1.8 {d16}, [r12,:64], r1
684 vst1.8 {d10}, [r0, :64], r1
685
686 vst1.8 {d2}, [r12,:64], r1
687 vst1.8 {d11}, [r0, :64], r1
688
689 vst1.8 {d3}, [r12,:64], r1
690 vst1.8 {d12}, [r0, :64], r1
691
692 vst1.8 {d4}, [r12,:64], r1
693 vst1.8 {d13}, [r0, :64], r1
694
695 vst1.8 {d5}, [r12,:64], r1
696 vst1.8 {d14}, [r0, :64], r1
697
698 vst1.8 {d6}, [r12,:64], r1
699 vst1.8 {d15}, [r0, :64], r1
700
701 vst1.8 {d8}, [r12,:64], r1
702 vst1.8 {d17}, [r0, :64], r1
703
704 vst1.8 {d9}, [r12,:64], r1
705 vst1.8 {d31}, [r0, :64], r1
706 sub r0, r0, r1, lsl #3
7079:
708 bx lr
7098:
710 @ The same writeback as in loop_filter_h_8_8
711 sub r12, r0, #4
712 add r0, r12, r1, lsl #2
713 transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
714
715 vst1.8 {d20}, [r12], r1
716 vst1.8 {d24}, [r0], r1
717 vst1.8 {d21}, [r12], r1
718 vst1.8 {d25}, [r0], r1
719 vst1.8 {d22}, [r12], r1
720 vst1.8 {d26}, [r0], r1
721 vst1.8 {d23}, [r12], r1
722 vst1.8 {d27}, [r0], r1
723 sub r0, r0, r1, lsl #3
724 add r0, r0, #4
725 bx lr
7267:
727 @ The same writeback as in loop_filter_h_4_8
728 sub r12, r0, #2
729 add r0, r12, r1, lsl #2
730 transpose_q_4x4 q11, q12, d22, d23, d24, d25
731 vst1.32 {d22[0]}, [r12], r1
732 vst1.32 {d22[1]}, [r0], r1
733 vst1.32 {d23[0]}, [r12], r1
734 vst1.32 {d23[1]}, [r0], r1
735 vst1.32 {d24[0]}, [r12], r1
736 vst1.32 {d24[1]}, [r0], r1
737 vst1.32 {d25[0]}, [r12], r1
738 vst1.32 {d25[1]}, [r0], r1
739 sub r0, r0, r1, lsl #3
740 add r0, r0, #2
741 bx lr
742endfunc
743
744function ff_vp9_loop_filter_h_16_8_neon, export=1
745 ldr r12, [sp]
746 push {lr}
747 vpush {q4-q7}
748 push {r12}
749 bl vp9_loop_filter_h_16_neon
750 add sp, sp, #4
751 vpop {q4-q7}
752 pop {pc}
753endfunc
754
755function ff_vp9_loop_filter_h_16_16_neon, export=1
756 ldr r12, [sp]
757 // The filter clobbers r2 and r3, but we need to keep them for the second round
758 push {r2, r3, lr}
759 vpush {q4-q7}
760 push {r12}
761 bl vp9_loop_filter_h_16_neon
762 add r0, r0, r1, lsl #3
763 ldr r2, [sp, #68]
764 ldr r3, [sp, #72]
765 bl vp9_loop_filter_h_16_neon
766 add sp, sp, #4
767 vpop {q4-q7}
768 pop {r2, r3, pc}
769endfunc