arm: vp9lpf: Implement the mix2_44 function with one single filter pass
libavcodec/arm/vp9lpf_neon.S (libav.git)
1 /*
2 * Copyright (c) 2016 Google Inc.
3 *
4 * This file is part of Libav.
5 *
6 * Libav is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * Libav is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with Libav; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "libavutil/arm/asm.S"
22 #include "neon.S"
23
24 @ Do an 8x8 transpose, using q registers for the subtransposes that don't
25 @ need to address the individual d registers.
26 @ r0,r1 == rq0, r2,r3 == rq1, etc
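@ The three vtrn passes (32-bit, 16-bit, then 8-bit) together implement
@ the full 8x8 byte transpose.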
27 .macro transpose_q_8x8 rq0, rq1, rq2, rq3, r0, r1, r2, r3, r4, r5, r6, r7
28 vtrn.32 \rq0, \rq2
29 vtrn.32 \rq1, \rq3
30 vtrn.16 \rq0, \rq1
31 vtrn.16 \rq2, \rq3
32 vtrn.8 \r0, \r1
33 vtrn.8 \r2, \r3
34 vtrn.8 \r4, \r5
35 vtrn.8 \r6, \r7
36 .endm
37
38 @ Do a 4x4 transpose, using q registers for the subtransposes that don't
39 @ need to address the individual d registers.
40 @ r0,r1 == rq0, r2,r3 == rq1
41 .macro transpose_q_4x4 rq0, rq1, r0, r1, r2, r3
42 vtrn.16 \rq0, \rq1
43 vtrn.8 \r0, \r1
44 vtrn.8 \r2, \r3
45 .endm
46
47 @ The input to and output from this macro are in the registers q8-q15,
48 @ and q0-q7 are used as scratch registers.
49 @ p3 = q8, p0 = q11, q0 = q12, q3 = q15
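@ A rough C sketch of the filter-enable mask computed below (for
@ orientation only; not part of the original source):
@   fm = max(|p3-p2|, |p2-p1|, |p1-p0|, |q0-q1|, |q1-q2|, |q2-q3|) <= I &&
@        |p0-q0| * 2 + (|p1-q1| >> 1)                              <= E
@ Since this macro filters two 8-pixel blocks in one pass, E, I (and H,
@ read from the stack further down) each pack two 8-bit thresholds,
@ one per block.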
50 .macro loop_filter_q
51 vdup.u8 d0, r2 @ E
52 lsr r2, r2, #8
53 vdup.u8 d2, r3 @ I
54 lsr r3, r3, #8
55 vdup.u8 d1, r2 @ E
56 vdup.u8 d3, r3 @ I
57
58 vabd.u8 q2, q8, q9 @ abs(p3 - p2)
59 vabd.u8 q3, q9, q10 @ abs(p2 - p1)
60 vabd.u8 q4, q10, q11 @ abs(p1 - p0)
61 vabd.u8 q5, q12, q13 @ abs(q0 - q1)
62 vabd.u8 q6, q13, q14 @ abs(q1 - q2)
63 vabd.u8 q7, q14, q15 @ abs(q2 - q3)
64 vmax.u8 q2, q2, q3
65 vmax.u8 q3, q4, q5
66 vmax.u8 q4, q6, q7
67 vabd.u8 q5, q11, q12 @ abs(p0 - q0)
68 vmax.u8 q2, q2, q3
69 vqadd.u8 q5, q5, q5 @ abs(p0 - q0) * 2
70 vabd.u8 q7, q10, q13 @ abs(p1 - q1)
71 vmax.u8 q2, q2, q4 @ max(abs(p3 - p2), ..., abs(q2 - q3))
72 vshr.u8 q7, q7, #1
73 vcle.u8 q2, q2, q1 @ max(abs()) <= I
74 vqadd.u8 q5, q5, q7 @ abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
75 vcle.u8 q5, q5, q0
76 vand q2, q2, q5 @ fm
77
78 vshrn.u16 d10, q2, #4
79 vmov r2, r3, d10
80 orrs r2, r2, r3
81 @ If no pixels need filtering, just exit as soon as possible
82 beq 9f
83
84 @ Calculate the normal inner loop filter for 2 or 4 pixels
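@ Reference C for the inner filter computed below (a sketch following the
@ code's own comments; clip_int8/clip_uint8 stand for saturation to
@ signed/unsigned 8 bit):
@   f  = clip_int8(3 * (q0 - p0) + (hev ? clip_int8(p1 - q1) : 0));
@   f1 = FFMIN(f + 4, 127) >> 3;
@   f2 = FFMIN(f + 3, 127) >> 3;
@   p0 = clip_uint8(p0 + f2);  q0 = clip_uint8(q0 - f1);
@   if (!hev) {
@       f  = (f1 + 1) >> 1;
@       p1 = clip_uint8(p1 + f);  q1 = clip_uint8(q1 - f);
@   }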
85 ldr r3, [sp, #64]
86 vabd.u8 q3, q10, q11 @ abs(p1 - p0)
87 vabd.u8 q4, q13, q12 @ abs(q1 - q0)
88
89 vsubl.u8 q5, d20, d26 @ p1 - q1
90 vsubl.u8 q6, d21, d27 @ p1 - q1
91 vmax.u8 q3, q3, q4 @ max(abs(p1 - p0), abs(q1 - q0))
92 vqmovn.s16 d10, q5 @ av_clip_int8p(p1 - q1)
93 vqmovn.s16 d11, q6 @ av_clip_int8p(p1 - q1)
94 vdup.u8 d8, r3 @ H
95 lsr r3, r3, #8
96 vdup.u8 d9, r3 @ H
97 vsubl.u8 q6, d24, d22 @ q0 - p0
98 vsubl.u8 q7, d25, d23 @ q0 - p0
99 vcle.u8 q3, q3, q4 @ !hev
100 vmov.s16 q0, #3
101 vand q3, q3, q2 @ !hev && fm && !flat8in
102
103 vmul.s16 q6, q6, q0 @ 3 * (q0 - p0)
104 vmul.s16 q7, q7, q0 @ 3 * (q0 - p0)
105 vbic q5, q5, q3 @ if (!hev) av_clip_int8 = 0
106 vaddw.s8 q6, q6, d10 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
107 vaddw.s8 q7, q7, d11 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
108 vmov.s8 q5, #4
109 vqmovn.s16 d12, q6
110 vqmovn.s16 d13, q7 @ av_clip_int8(3 * (q0 - p0) [+ av_clip_int8(p1 - q1)], BIT_DEPTH - 1) = f
111 vmov.s8 q0, #3
112
113 vqadd.s8 q5, q6, q5 @ FFMIN(f + 4, 127)
114 vqadd.s8 q0, q6, q0 @ FFMIN(f + 3, 127)
115 vmovl.u8 q6, d22 @ p0
116 vmovl.u8 q7, d23 @ p0
117 vshr.s8 q5, q5, #3 @ f1
118 vshr.s8 q0, q0, #3 @ f2
119
120 vaddw.s8 q6, q6, d0 @ p0 + f2
121 vaddw.s8 q7, q7, d1 @ p0 + f2
122 vqmovun.s16 d0, q6 @ out p0
123 vmovl.u8 q6, d24 @ q0
124 vqmovun.s16 d1, q7 @ out p0
125 vmovl.u8 q7, d25 @ q0
126 vsubw.s8 q6, q6, d10 @ q0 - f1
127 vsubw.s8 q7, q7, d11 @ q0 - f1
128 vqmovun.s16 d12, q6 @ out q0
129 vqmovun.s16 d13, q7 @ out q0
130 vrshr.s8 q5, q5, #1 @ f = (f1 + 1) >> 1
131 vbit q11, q0, q2 @ if (fm && !flat8in)
132 vbit q12, q6, q2
133
134 vmovl.u8 q0, d20 @ p1
135 vmovl.u8 q2, d21 @ p1
136 vmovl.u8 q6, d26 @ q1
137 vmovl.u8 q7, d27 @ q1
138 vaddw.s8 q0, q0, d10 @ p1 + f
139 vaddw.s8 q2, q2, d11 @ p1 + f
140 vsubw.s8 q6, q6, d10 @ q1 - f
141 vsubw.s8 q7, q7, d11 @ q1 - f
142 vqmovun.s16 d0, q0 @ out p1
143 vqmovun.s16 d1, q2 @ out p1
144 vqmovun.s16 d12, q6 @ out q1
145 vqmovun.s16 d13, q7 @ out q1
146 vbit q10, q0, q3 @ if (!hev && fm && !flat8in)
147 vbit q13, q6, q3
148 .endm
149
150 @ The input to and output from this macro are in the registers d16-d31,
151 @ and d0-d7 are used as scratch registers.
152 @ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
153 @ Depending on the width of the loop filter, we either use d16-d19
154 @ and d28-d31 as temp registers, or d8-d15.
155 @ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
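@ wd selects the filter strength: wd == 4 computes only the normal inner
@ filter, wd == 8 additionally computes the flat8in mask and the smoothing
@ over p3..q3, and wd == 16 also computes flat8out and the smoothing over
@ p7..q7.
@ The caller must provide label 9 (early exit when nothing needs filtering)
@ and, for wd == 8, label 6; for wd == 16, labels 7 and 8 (writeback of only
@ the inner 4 or 6 pixels).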
156 .macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4
157 vdup.u8 d0, r2 @ E
158 vdup.u8 d2, r3 @ I
159 ldr r3, [sp]
160
161 vabd.u8 d4, d20, d21 @ abs(p3 - p2)
162 vabd.u8 d5, d21, d22 @ abs(p2 - p1)
163 vabd.u8 d6, d22, d23 @ abs(p1 - p0)
164 vabd.u8 d7, d24, d25 @ abs(q0 - q1)
165 vabd.u8 \tmp1, d25, d26 @ abs(q1 - q2)
166 vabd.u8 \tmp2, d26, d27 @ abs(q2 - q3)
167 vmax.u8 d4, d4, d5
168 vmax.u8 d5, d6, d7
169 vmax.u8 \tmp1, \tmp1, \tmp2
170 vabd.u8 d6, d23, d24 @ abs(p0 - q0)
171 vmax.u8 d4, d4, d5
172 vqadd.u8 d6, d6, d6 @ abs(p0 - q0) * 2
173 vabd.u8 d5, d22, d25 @ abs(p1 - q1)
174 vmax.u8 d4, d4, \tmp1 @ max(abs(p3 - p2), ..., abs(q2 - q3))
175 vshr.u8 d5, d5, #1
176 vcle.u8 d4, d4, d2 @ max(abs()) <= I
177 vqadd.u8 d6, d6, d5 @ abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
178 vcle.u8 d5, d6, d0
179 vand d4, d4, d5 @ fm
180
181 vdup.u8 d3, r3 @ H
182 vmov r2, r3, d4
183 orrs r2, r2, r3
184 @ If no pixels need filtering, just exit as soon as possible
185 beq 9f
186
187 .if \wd >= 8
188 vmov.u8 d0, #1
189
190 vabd.u8 d6, d20, d23 @ abs(p3 - p0)
191 vabd.u8 d2, d21, d23 @ abs(p2 - p0)
192 vabd.u8 d1, d22, d23 @ abs(p1 - p0)
193 vabd.u8 \tmp1, d25, d24 @ abs(q1 - q0)
194 vabd.u8 \tmp2, d26, d24 @ abs(q2 - q0)
195 vabd.u8 \tmp3, d27, d24 @ abs(q3 - q0)
196 vmax.u8 d6, d6, d2
197 vmax.u8 d1, d1, \tmp1
198 vmax.u8 \tmp2, \tmp2, \tmp3
199 .if \wd == 16
200 vabd.u8 d7, d16, d23 @ abs(p7 - p0)
201 vmax.u8 d6, d6, d1
202 vabd.u8 d2, d17, d23 @ abs(p6 - p0)
203 vmax.u8 d6, d6, \tmp2
204 vabd.u8 d1, d18, d23 @ abs(p5 - p0)
205 vcle.u8 d6, d6, d0 @ flat8in
206 vabd.u8 d8, d19, d23 @ abs(p4 - p0)
207 vand d6, d6, d4 @ flat8in && fm
208 vabd.u8 d9, d28, d24 @ abs(q4 - q0)
209 vbic d4, d4, d6 @ fm && !flat8in
210 vabd.u8 d10, d29, d24 @ abs(q5 - q0)
211 vabd.u8 d11, d30, d24 @ abs(q6 - q0)
212 vabd.u8 d12, d31, d24 @ abs(q7 - q0)
213
214 vmax.u8 d7, d7, d2
215 vmax.u8 d1, d1, d8
216 vmax.u8 d9, d9, d10
217 vmax.u8 d11, d11, d12
218 @ The rest of the calculation of flat8out is interleaved below
219 .else
220 @ The rest of the calculation of flat8in is interleaved below
221 .endif
222 .endif
223
224 @ Calculate the normal inner loop filter for 2 or 4 pixels
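@ This is the same inner filter arithmetic as sketched in loop_filter_q
@ above, here interleaved with the flat8in/flat8out mask calculations.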
225 vabd.u8 d5, d22, d23 @ abs(p1 - p0)
226 .if \wd == 16
227 vmax.u8 d7, d7, d1
228 vmax.u8 d9, d9, d11
229 .elseif \wd == 8
230 vmax.u8 d6, d6, d1
231 .endif
232 vabd.u8 d1, d25, d24 @ abs(q1 - q0)
233 .if \wd == 16
234 vmax.u8 d7, d7, d9
235 .elseif \wd == 8
236 vmax.u8 d6, d6, \tmp2
237 .endif
238 vsubl.u8 \tmpq1, d22, d25 @ p1 - q1
239 vmax.u8 d5, d5, d1 @ max(abs(p1 - p0), abs(q1 - q0))
240 vsubl.u8 \tmpq2, d24, d23 @ q0 - p0
241 vmov.s16 \tmpq3, #3
242 .if \wd == 8
243 vcle.u8 d6, d6, d0 @ flat8in
244 .endif
245 vcle.u8 d5, d5, d3 @ !hev
246 .if \wd == 8
247 vand d6, d6, d4 @ flat8in && fm
248 .endif
249 vqmovn.s16 \tmp1, \tmpq1 @ av_clip_int8(p1 - q1)
250 .if \wd == 16
251 vcle.u8 d7, d7, d0 @ flat8out
252 .elseif \wd == 8
253 vbic d4, d4, d6 @ fm && !flat8in
254 .endif
255 vand d5, d5, d4 @ !hev && fm && !flat8in
256 .if \wd == 16
257 vand d7, d7, d6 @ flat8out && flat8in && fm
258 .endif
259
260 vmul.s16 \tmpq2, \tmpq2, \tmpq3 @ 3 * (q0 - p0)
261 vbic \tmp1, \tmp1, d5 @ if (!hev) av_clip_int8 = 0
262 vmov.s8 d2, #4
263 vaddw.s8 \tmpq2, \tmpq2, \tmp1 @ 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
264 vmov.s8 d3, #3
265 vqmovn.s16 \tmp1, \tmpq2 @ f
266 .if \wd == 16
267 vbic d6, d6, d7 @ fm && flat8in && !flat8out
268 .endif
269
270 vqadd.s8 \tmp3, \tmp1, d2 @ FFMIN(f + 4, 127)
271 vqadd.s8 \tmp4, \tmp1, d3 @ FFMIN(f + 3, 127)
272 vmovl.u8 q0, d23 @ p0
273 vshr.s8 \tmp3, \tmp3, #3 @ f1
274 vshr.s8 \tmp4, \tmp4, #3 @ f2
275
276 vmovl.u8 q1, d24 @ q0
277 vaddw.s8 q0, q0, \tmp4 @ p0 + f2
278 vsubw.s8 q1, q1, \tmp3 @ q0 - f1
279 vqmovun.s16 d0, q0 @ out p0
280 vqmovun.s16 d1, q1 @ out q0
281 vrshr.s8 \tmp3, \tmp3, #1 @ f = (f1 + 1) >> 1
282 vbit d23, d0, d4 @ if (fm && !flat8in)
283 vbit d24, d1, d4
284
285 vmovl.u8 q0, d22 @ p1
286 vmovl.u8 q1, d25 @ q1
287 .if \wd >= 8
288 vmov r2, r3, d6
289 .endif
290 vaddw.s8 q0, q0, \tmp3 @ p1 + f
291 vsubw.s8 q1, q1, \tmp3 @ q1 - f
292 .if \wd >= 8
293 orrs r2, r2, r3
294 .endif
295 vqmovun.s16 d0, q0 @ out p1
296 vqmovun.s16 d2, q1 @ out q1
297 vbit d22, d0, d5 @ if (!hev && fm && !flat8in)
298 vbit d25, d2, d5
299
300 .if \wd >= 8
301 @ If no pixels need flat8in, jump to flat8out
302 @ (or to a writeout of the inner 4 pixels, for wd=8)
303 beq 6f
304
305 @ flat8in
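@ Rough reference for the smoothing applied here (a sketch of the usual
@ 7-tap flat filter, shown for the first two outputs; the rest follow the
@ same sliding-window pattern of adding one new term and subtracting the
@ oldest):
@   op2 = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3
@   op1 = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3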
306 vaddl.u8 \tmpq1, d20, d21
307 vaddl.u8 \tmpq2, d22, d25
308 vaddl.u8 \tmpq3, d20, d22
309 vaddl.u8 \tmpq4, d23, d26
310 vadd.u16 q0, \tmpq1, \tmpq1
311 vaddw.u8 q0, q0, d23
312 vaddw.u8 q0, q0, d24
313 vadd.u16 q0, q0, \tmpq3
314 vsub.s16 \tmpq2, \tmpq2, \tmpq1
315 vsub.s16 \tmpq4, \tmpq4, \tmpq3
316 vrshrn.u16 d2, q0, #3 @ out p2
317
318 vadd.u16 q0, q0, \tmpq2
319 vaddl.u8 \tmpq1, d20, d23
320 vaddl.u8 \tmpq2, d24, d27
321 vrshrn.u16 d3, q0, #3 @ out p1
322
323 vadd.u16 q0, q0, \tmpq4
324 vsub.s16 \tmpq2, \tmpq2, \tmpq1
325 vaddl.u8 \tmpq3, d21, d24
326 vaddl.u8 \tmpq4, d25, d27
327 vrshrn.u16 d4, q0, #3 @ out p0
328
329 vadd.u16 q0, q0, \tmpq2
330 vsub.s16 \tmpq4, \tmpq4, \tmpq3
331 vaddl.u8 \tmpq1, d22, d25
332 vaddl.u8 \tmpq2, d26, d27
333 vrshrn.u16 d5, q0, #3 @ out q0
334
335 vadd.u16 q0, q0, \tmpq4
336 vsub.s16 \tmpq2, \tmpq2, \tmpq1
337 vrshrn.u16 \tmp5, q0, #3 @ out q1
338
339 vadd.u16 q0, q0, \tmpq2
340 @ The output here is written back into the input registers. This doesn't
341 @ matter for the flat8out part below, since we only update those pixels
342 @ which won't be touched below.
343 vbit d21, d2, d6
344 vbit d22, d3, d6
345 vbit d23, d4, d6
346 vrshrn.u16 \tmp6, q0, #3 @ out q2
347 vbit d24, d5, d6
348 vbit d25, \tmp5, d6
349 vbit d26, \tmp6, d6
350 .endif
351 .if \wd == 16
352 6:
353 vorr d2, d6, d7
354 vmov r2, r3, d2
355 orrs r2, r2, r3
356 @ If no pixels needed flat8in nor flat8out, jump to a
357 @ writeout of the inner 4 pixels
358 beq 7f
359 vmov r2, r3, d7
360 orrs r2, r2, r3
361 @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
362 beq 8f
363
364 @ flat8out
365 @ This writes all outputs into d2-d17 (skipping d7 and d16).
366 @ If this part is skipped, the output is read from d21-d26 (which is the input
367 @ to this section).
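@ Rough reference for the wide smoothing below (a sketch; each further
@ output slides the 16-wide window by adding one new pixel and
@ subtracting the oldest one):
@   op6 = (7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4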
368 vshll.u8 q0, d16, #3 @ 8 * d16
369 vsubw.u8 q0, q0, d16 @ 7 * d16
370 vaddw.u8 q0, q0, d17
371 vaddl.u8 q4, d17, d18
372 vaddl.u8 q5, d19, d20
373 vadd.s16 q0, q0, q4
374 vaddl.u8 q4, d16, d17
375 vaddl.u8 q6, d21, d22
376 vadd.s16 q0, q0, q5
377 vaddl.u8 q5, d18, d25
378 vaddl.u8 q7, d23, d24
379 vsub.s16 q5, q5, q4
380 vadd.s16 q0, q0, q6
381 vadd.s16 q0, q0, q7
382 vaddl.u8 q6, d16, d18
383 vaddl.u8 q7, d19, d26
384 vrshrn.u16 d2, q0, #4
385
386 vadd.s16 q0, q0, q5
387 vaddl.u8 q4, d16, d19
388 vaddl.u8 q5, d20, d27
389 vsub.s16 q7, q7, q6
390 vbif d2, d17, d7
391 vrshrn.u16 d3, q0, #4
392
393 vadd.s16 q0, q0, q7
394 vaddl.u8 q6, d16, d20
395 vaddl.u8 q7, d21, d28
396 vsub.s16 q5, q5, q4
397 vbif d3, d18, d7
398 vrshrn.u16 d4, q0, #4
399
400 vadd.s16 q0, q0, q5
401 vaddl.u8 q4, d16, d21
402 vaddl.u8 q5, d22, d29
403 vsub.s16 q7, q7, q6
404 vbif d4, d19, d7
405 vrshrn.u16 d5, q0, #4
406
407 vadd.s16 q0, q0, q7
408 vaddl.u8 q6, d16, d22
409 vaddl.u8 q7, d23, d30
410 vsub.s16 q5, q5, q4
411 vbif d5, d20, d7
412 vrshrn.u16 d6, q0, #4
413
414 vadd.s16 q0, q0, q5
415 vaddl.u8 q5, d16, d23
416 vsub.s16 q7, q7, q6
417 vaddl.u8 q6, d24, d31
418 vbif d6, d21, d7
419 vrshrn.u16 d8, q0, #4
420
421 vadd.s16 q0, q0, q7
422 vsub.s16 q5, q6, q5
423 vaddl.u8 q6, d17, d24
424 vaddl.u8 q7, d25, d31
425 vbif d8, d22, d7
426 vrshrn.u16 d9, q0, #4
427
428 vadd.s16 q0, q0, q5
429 vsub.s16 q7, q7, q6
430 vaddl.u8 q6, d26, d31
431 vbif d9, d23, d7
432 vrshrn.u16 d10, q0, #4
433
434 vadd.s16 q0, q0, q7
435 vaddl.u8 q7, d18, d25
436 vaddl.u8 q9, d19, d26
437 vsub.s16 q6, q6, q7
438 vaddl.u8 q7, d27, d31
439 vbif d10, d24, d7
440 vrshrn.u16 d11, q0, #4
441
442 vadd.s16 q0, q0, q6
443 vaddl.u8 q6, d20, d27
444 vsub.s16 q7, q7, q9
445 vaddl.u8 q9, d28, d31
446 vbif d11, d25, d7
447 vsub.s16 q9, q9, q6
448 vrshrn.u16 d12, q0, #4
449
450 vadd.s16 q0, q0, q7
451 vaddl.u8 q7, d21, d28
452 vaddl.u8 q10, d29, d31
453 vbif d12, d26, d7
454 vrshrn.u16 d13, q0, #4
455
456 vadd.s16 q0, q0, q9
457 vsub.s16 q10, q10, q7
458 vaddl.u8 q9, d22, d29
459 vaddl.u8 q11, d30, d31
460 vbif d13, d27, d7
461 vrshrn.u16 d14, q0, #4
462
463 vadd.s16 q0, q0, q10
464 vsub.s16 q11, q11, q9
465 vbif d14, d28, d7
466 vrshrn.u16 d15, q0, #4
467
468 vadd.s16 q0, q0, q11
469 vbif d15, d29, d7
470 vrshrn.u16 d17, q0, #4
471 vbif d17, d30, d7
472 .endif
473 .endm
474
475 @ For wd <= 8, we use d16-d19 and d28-d31 for temp registers,
476 @ while we need those for inputs/outputs in wd=16 and use d8-d15
477 @ for temp registers there instead.
478 .macro loop_filter_4
479 loop_filter 4, d16, d17, d18, d19, d28, d29, d30, d31, q8, q9, q14, q15
480 .endm
481
482 .macro loop_filter_8
483 loop_filter 8, d16, d17, d18, d19, d28, d29, d30, d31, q8, q9, q14, q15
484 .endm
485
486 .macro loop_filter_16
487 loop_filter 16, d8, d9, d10, d11, d12, d13, d14, d15, q4, q5, q6, q7
488 .endm
489
490
491 @ The public functions in this file have the following signature:
492 @ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
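@ The *_16 variants filter a 16 pixel wide edge. For the 44 case both
@ 8-pixel halves are handled in a single pass (loop_filter_q), so mb_lim,
@ lim and hev_thr each pack two 8-bit thresholds, with the low byte
@ applying to the first half.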
493
494 function ff_vp9_loop_filter_v_4_8_neon, export=1
495 sub r12, r0, r1, lsl #2
496 vld1.8 {d20}, [r12,:64], r1 @ p3
497 vld1.8 {d24}, [r0, :64], r1 @ q0
498 vld1.8 {d21}, [r12,:64], r1 @ p2
499 vld1.8 {d25}, [r0, :64], r1 @ q1
500 vld1.8 {d22}, [r12,:64], r1 @ p1
501 vld1.8 {d26}, [r0, :64], r1 @ q2
502 vld1.8 {d23}, [r12,:64], r1 @ p0
503 vld1.8 {d27}, [r0, :64], r1 @ q3
504 sub r0, r0, r1, lsl #2
505 sub r12, r12, r1, lsl #1
506
507 loop_filter_4
508
509 vst1.8 {d22}, [r12,:64], r1
510 vst1.8 {d24}, [r0, :64], r1
511 vst1.8 {d23}, [r12,:64], r1
512 vst1.8 {d25}, [r0, :64], r1
513 9:
514 bx lr
515 endfunc
516
517 function ff_vp9_loop_filter_h_4_8_neon, export=1
518 sub r12, r0, #4
519 add r0, r12, r1, lsl #2
520 vld1.8 {d20}, [r12], r1
521 vld1.8 {d24}, [r0], r1
522 vld1.8 {d21}, [r12], r1
523 vld1.8 {d25}, [r0], r1
524 vld1.8 {d22}, [r12], r1
525 vld1.8 {d26}, [r0], r1
526 vld1.8 {d23}, [r12], r1
527 vld1.8 {d27}, [r0], r1
528
529 sub r12, r12, r1, lsl #2
530 sub r0, r0, r1, lsl #2
531 @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
532 @ outermost 2 pixels since they aren't changed.
533 add r12, r12, #2
534 add r0, r0, #2
535
536 @ Transpose the 8x8 pixels, taking advantage of q registers, to get
537 @ one register per column.
538 transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
539
540 loop_filter_4
541
542 @ We will only write the mid 4 pixels back; after the loop filter,
543 @ these are in d22, d23, d24, d25 (q11, q12), ordered as rows
544 @ (8x4 pixels). We need to transpose them to columns, done with a
545 @ 4x4 transpose (which in practice is two 4x4 transposes of the two
546 @ 4x4 halves of the 8x4 pixels; into 4x8 pixels).
547 transpose_q_4x4 q11, q12, d22, d23, d24, d25
548
549 vst1.32 {d22[0]}, [r12], r1
550 vst1.32 {d22[1]}, [r0], r1
551 vst1.32 {d23[0]}, [r12], r1
552 vst1.32 {d23[1]}, [r0], r1
553 vst1.32 {d24[0]}, [r12], r1
554 vst1.32 {d24[1]}, [r0], r1
555 vst1.32 {d25[0]}, [r12], r1
556 vst1.32 {d25[1]}, [r0], r1
557 9:
558 bx lr
559 endfunc
560
561 function ff_vp9_loop_filter_v_44_16_neon, export=1
562 vpush {q4-q7}
563 sub r12, r0, r1, lsl #2
564 vld1.8 {q8}, [r12,:128], r1 @ p3
565 vld1.8 {q12}, [r0, :128], r1 @ q0
566 vld1.8 {q9}, [r12,:128], r1 @ p2
567 vld1.8 {q13}, [r0, :128], r1 @ q1
568 vld1.8 {q10}, [r12,:128], r1 @ p1
569 vld1.8 {q14}, [r0, :128], r1 @ q2
570 vld1.8 {q11}, [r12,:128], r1 @ p0
571 vld1.8 {q15}, [r0, :128], r1 @ q3
572 sub r0, r0, r1, lsl #2
573 sub r12, r12, r1, lsl #1
574
575 loop_filter_q
576
577 vst1.8 {q10}, [r12,:128], r1
578 vst1.8 {q12}, [r0, :128], r1
579 vst1.8 {q11}, [r12,:128], r1
580 vst1.8 {q13}, [r0, :128], r1
581 9:
582 vpop {q4-q7}
583 bx lr
584 endfunc
585
586 function ff_vp9_loop_filter_h_44_16_neon, export=1
587 vpush {q4-q7}
588 sub r12, r0, #4
589 add r0, r12, r1, lsl #2
590 vld1.8 {d16}, [r12], r1
591 vld1.8 {d24}, [r0], r1
592 vld1.8 {d18}, [r12], r1
593 vld1.8 {d26}, [r0], r1
594 vld1.8 {d20}, [r12], r1
595 vld1.8 {d28}, [r0], r1
596 vld1.8 {d22}, [r12], r1
597 vld1.8 {d30}, [r0], r1
598 mov r12, r0
599 add r0, r0, r1, lsl #2
600 vld1.8 {d17}, [r12], r1
601 vld1.8 {d25}, [r0], r1
602 vld1.8 {d19}, [r12], r1
603 vld1.8 {d27}, [r0], r1
604 vld1.8 {d21}, [r12], r1
605 vld1.8 {d29}, [r0], r1
606 vld1.8 {d23}, [r12], r1
607 vld1.8 {d31}, [r0], r1
608
609 @ Transpose the 16x8 pixels, as two 8x8 parts
610 transpose_8x8 q8, q9, q10, q11, q12, q13, q14, q15
611
612 loop_filter_q
613
614 sub r12, r0, r1, lsl #4
615 add r0, r12, r1, lsl #3
616 @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
617 @ outermost 2 pixels since they aren't changed.
618 add r12, r12, #2
619 add r0, r0, #2
620
621 @ We will only write the mid 4 pixels back; after the loop filter,
622 @ these are in q10, q11, q12, q13, ordered as rows (16x4 pixels).
623 @ We need to transpose them to columns, done with a 4x4 transpose
624 @ (which in practice is four 4x4 transposes of the 4x4 blocks of
625 @ the 16x4 pixels; into 4x16 pixels).
626 transpose_4x4 q10, q11, q12, q13
627
628 vst1.32 {d20[0]}, [r12], r1
629 vst1.32 {d21[0]}, [r0], r1
630 vst1.32 {d22[0]}, [r12], r1
631 vst1.32 {d23[0]}, [r0], r1
632 vst1.32 {d24[0]}, [r12], r1
633 vst1.32 {d25[0]}, [r0], r1
634 vst1.32 {d26[0]}, [r12], r1
635 vst1.32 {d27[0]}, [r0], r1
636 vst1.32 {d20[1]}, [r12], r1
637 vst1.32 {d21[1]}, [r0], r1
638 vst1.32 {d22[1]}, [r12], r1
639 vst1.32 {d23[1]}, [r0], r1
640 vst1.32 {d24[1]}, [r12], r1
641 vst1.32 {d25[1]}, [r0], r1
642 vst1.32 {d26[1]}, [r12], r1
643 vst1.32 {d27[1]}, [r0], r1
644 9:
645 vpop {q4-q7}
646 bx lr
647 endfunc
648
649 function ff_vp9_loop_filter_v_8_8_neon, export=1
650 sub r12, r0, r1, lsl #2
651 vld1.8 {d20}, [r12,:64], r1 @ p3
652 vld1.8 {d24}, [r0, :64], r1 @ q0
653 vld1.8 {d21}, [r12,:64], r1 @ p2
654 vld1.8 {d25}, [r0, :64], r1 @ q1
655 vld1.8 {d22}, [r12,:64], r1 @ p1
656 vld1.8 {d26}, [r0, :64], r1 @ q2
657 vld1.8 {d23}, [r12,:64], r1 @ p0
658 vld1.8 {d27}, [r0, :64], r1 @ q3
659 sub r12, r12, r1, lsl #2
660 sub r0, r0, r1, lsl #2
661 add r12, r12, r1
662
663 loop_filter_8
664
665 vst1.8 {d21}, [r12,:64], r1
666 vst1.8 {d24}, [r0, :64], r1
667 vst1.8 {d22}, [r12,:64], r1
668 vst1.8 {d25}, [r0, :64], r1
669 vst1.8 {d23}, [r12,:64], r1
670 vst1.8 {d26}, [r0, :64], r1
671 9:
672 bx lr
673 6:
674 sub r12, r0, r1, lsl #1
675 vst1.8 {d22}, [r12,:64], r1
676 vst1.8 {d24}, [r0, :64], r1
677 vst1.8 {d23}, [r12,:64], r1
678 vst1.8 {d25}, [r0, :64], r1
679 bx lr
680 endfunc
681
682 function ff_vp9_loop_filter_h_8_8_neon, export=1
683 sub r12, r0, #4
684 add r0, r12, r1, lsl #2
685 vld1.8 {d20}, [r12], r1
686 vld1.8 {d24}, [r0], r1
687 vld1.8 {d21}, [r12], r1
688 vld1.8 {d25}, [r0], r1
689 vld1.8 {d22}, [r12], r1
690 vld1.8 {d26}, [r0], r1
691 vld1.8 {d23}, [r12], r1
692 vld1.8 {d27}, [r0], r1
693
694 sub r12, r12, r1, lsl #2
695 sub r0, r0, r1, lsl #2
696
697 transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
698
699 loop_filter_8
700
701 @ Even though only 6 pixels per row have been changed, we write the
702 @ full 8 pixel registers.
703 transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
704
705 vst1.8 {d20}, [r12], r1
706 vst1.8 {d24}, [r0], r1
707 vst1.8 {d21}, [r12], r1
708 vst1.8 {d25}, [r0], r1
709 vst1.8 {d22}, [r12], r1
710 vst1.8 {d26}, [r0], r1
711 vst1.8 {d23}, [r12], r1
712 vst1.8 {d27}, [r0], r1
713 9:
714 bx lr
715 6:
716 @ If we didn't need to do the flat8in part, we use the same writeback
717 @ as in loop_filter_h_4_8.
718 add r12, r12, #2
719 add r0, r0, #2
720 transpose_q_4x4 q11, q12, d22, d23, d24, d25
721 vst1.32 {d22[0]}, [r12], r1
722 vst1.32 {d22[1]}, [r0], r1
723 vst1.32 {d23[0]}, [r12], r1
724 vst1.32 {d23[1]}, [r0], r1
725 vst1.32 {d24[0]}, [r12], r1
726 vst1.32 {d24[1]}, [r0], r1
727 vst1.32 {d25[0]}, [r12], r1
728 vst1.32 {d25[1]}, [r0], r1
729 bx lr
730 endfunc
731
732 function vp9_loop_filter_v_16_neon
733 sub r12, r0, r1, lsl #3
734 @ Read p7-p0 using r12 and q0-q7 using r0
735 vld1.8 {d16}, [r12,:64], r1 @ p7
736 vld1.8 {d24}, [r0, :64], r1 @ q0
737 vld1.8 {d17}, [r12,:64], r1 @ p6
738 vld1.8 {d25}, [r0, :64], r1 @ q1
739 vld1.8 {d18}, [r12,:64], r1 @ p5
740 vld1.8 {d26}, [r0, :64], r1 @ q2
741 vld1.8 {d19}, [r12,:64], r1 @ p4
742 vld1.8 {d27}, [r0, :64], r1 @ q3
743 vld1.8 {d20}, [r12,:64], r1 @ p3
744 vld1.8 {d28}, [r0, :64], r1 @ q4
745 vld1.8 {d21}, [r12,:64], r1 @ p2
746 vld1.8 {d29}, [r0, :64], r1 @ q5
747 vld1.8 {d22}, [r12,:64], r1 @ p1
748 vld1.8 {d30}, [r0, :64], r1 @ q6
749 vld1.8 {d23}, [r12,:64], r1 @ p0
750 vld1.8 {d31}, [r0, :64], r1 @ q7
751 sub r12, r12, r1, lsl #3
752 sub r0, r0, r1, lsl #3
753 add r12, r12, r1
754
755 loop_filter_16
756
757 @ If we did the flat8out part, we get the output in
758 @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride,
759 @ store d2-d9 there, and d10-d17 into r0.
760 vst1.8 {d2}, [r12,:64], r1
761 vst1.8 {d10}, [r0, :64], r1
762 vst1.8 {d3}, [r12,:64], r1
763 vst1.8 {d11}, [r0, :64], r1
764 vst1.8 {d4}, [r12,:64], r1
765 vst1.8 {d12}, [r0, :64], r1
766 vst1.8 {d5}, [r12,:64], r1
767 vst1.8 {d13}, [r0, :64], r1
768 vst1.8 {d6}, [r12,:64], r1
769 vst1.8 {d14}, [r0, :64], r1
770 vst1.8 {d8}, [r12,:64], r1
771 vst1.8 {d15}, [r0, :64], r1
772 vst1.8 {d9}, [r12,:64], r1
773 vst1.8 {d17}, [r0, :64], r1
774 sub r0, r0, r1, lsl #3
775 add r0, r0, r1
776
777 9:
778 bx lr
779
780 8:
781 add r12, r12, r1, lsl #2
782 @ If we didn't do the flat8out part, the output is left in the
783 @ input registers.
784 vst1.8 {d21}, [r12,:64], r1
785 vst1.8 {d24}, [r0, :64], r1
786 vst1.8 {d22}, [r12,:64], r1
787 vst1.8 {d25}, [r0, :64], r1
788 vst1.8 {d23}, [r12,:64], r1
789 vst1.8 {d26}, [r0, :64], r1
790 sub r0, r0, r1, lsl #1
791 sub r0, r0, r1
792 bx lr
793 7:
794 sub r12, r0, r1, lsl #1
795 vst1.8 {d22}, [r12,:64], r1
796 vst1.8 {d24}, [r0, :64], r1
797 vst1.8 {d23}, [r12,:64], r1
798 vst1.8 {d25}, [r0, :64], r1
799 sub r0, r0, r1, lsl #1
800 bx lr
801 endfunc
802
803 function ff_vp9_loop_filter_v_16_8_neon, export=1
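@ hev_thr is passed on the stack; reload it and push it again below the
@ saved registers, so the internal function can read it at [sp].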
804 ldr r12, [sp]
805 push {lr}
806 vpush {q4-q7}
807 push {r12}
808 bl vp9_loop_filter_v_16_neon
809 add sp, sp, #4
810 vpop {q4-q7}
811 pop {pc}
812 endfunc
813
814 function ff_vp9_loop_filter_v_16_16_neon, export=1
815 ldr r12, [sp]
816 // The filter clobbers r2 and r3, but we need to keep them for the second round
817 push {r2, r3, lr}
818 vpush {q4-q7}
819 push {r12}
820 bl vp9_loop_filter_v_16_neon
821 add r0, #8
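@ Reload E and I for the second half; they were saved above the re-pushed
@ hev_thr (4 bytes) and q4-q7 (64 bytes), i.e. at sp + 68 and sp + 72.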
822 ldr r2, [sp, #68]
823 ldr r3, [sp, #72]
824 bl vp9_loop_filter_v_16_neon
825 add sp, sp, #4
826 vpop {q4-q7}
827 pop {r2, r3, pc}
828 endfunc
829
830 function vp9_loop_filter_h_16_neon
831 sub r12, r0, #8
832 vld1.8 {d16}, [r12,:64], r1
833 vld1.8 {d24}, [r0, :64], r1
834 vld1.8 {d17}, [r12,:64], r1
835 vld1.8 {d25}, [r0, :64], r1
836 vld1.8 {d18}, [r12,:64], r1
837 vld1.8 {d26}, [r0, :64], r1
838 vld1.8 {d19}, [r12,:64], r1
839 vld1.8 {d27}, [r0, :64], r1
840 vld1.8 {d20}, [r12,:64], r1
841 vld1.8 {d28}, [r0, :64], r1
842 vld1.8 {d21}, [r12,:64], r1
843 vld1.8 {d29}, [r0, :64], r1
844 vld1.8 {d22}, [r12,:64], r1
845 vld1.8 {d30}, [r0, :64], r1
846 vld1.8 {d23}, [r12,:64], r1
847 vld1.8 {d31}, [r0, :64], r1
848 sub r0, r0, r1, lsl #3
849 sub r12, r12, r1, lsl #3
850
851 @ The 16x8 pixels read above are in two 8x8 blocks; the left
852 @ half in d16-d23, and the right half in d24-d31. Do two 8x8 transposes
853 @ of this, to get one column per register. This could be done with two
854 @ transpose_8x8 as below, but this takes advantage of the q registers.
855 transpose16_4x4 q8, q9, q10, q11, q12, q13, q14, q15
856 vtrn.8 d16, d17
857 vtrn.8 d18, d19
858 vtrn.8 d20, d21
859 vtrn.8 d22, d23
860 vtrn.8 d24, d25
861 vtrn.8 d26, d27
862 vtrn.8 d28, d29
863 vtrn.8 d30, d31
864
865 loop_filter_16
866
867 @ Transpose back; this is the same transpose as above, but
868 @ we can't take advantage of q registers for the transpose, since
869 @ not all d registers in the transpose are consecutive.
870 transpose_8x8 d16, d2, d3, d4, d5, d6, d8, d9
871 transpose_8x8 d10, d11, d12, d13, d14, d15, d17, d31
872
873 vst1.8 {d16}, [r12,:64], r1
874 vst1.8 {d10}, [r0, :64], r1
875
876 vst1.8 {d2}, [r12,:64], r1
877 vst1.8 {d11}, [r0, :64], r1
878
879 vst1.8 {d3}, [r12,:64], r1
880 vst1.8 {d12}, [r0, :64], r1
881
882 vst1.8 {d4}, [r12,:64], r1
883 vst1.8 {d13}, [r0, :64], r1
884
885 vst1.8 {d5}, [r12,:64], r1
886 vst1.8 {d14}, [r0, :64], r1
887
888 vst1.8 {d6}, [r12,:64], r1
889 vst1.8 {d15}, [r0, :64], r1
890
891 vst1.8 {d8}, [r12,:64], r1
892 vst1.8 {d17}, [r0, :64], r1
893
894 vst1.8 {d9}, [r12,:64], r1
895 vst1.8 {d31}, [r0, :64], r1
896 sub r0, r0, r1, lsl #3
897 9:
898 bx lr
899 8:
900 @ The same writeback as in loop_filter_h_8_8
901 sub r12, r0, #4
902 add r0, r12, r1, lsl #2
903 transpose_q_8x8 q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
904
905 vst1.8 {d20}, [r12], r1
906 vst1.8 {d24}, [r0], r1
907 vst1.8 {d21}, [r12], r1
908 vst1.8 {d25}, [r0], r1
909 vst1.8 {d22}, [r12], r1
910 vst1.8 {d26}, [r0], r1
911 vst1.8 {d23}, [r12], r1
912 vst1.8 {d27}, [r0], r1
913 sub r0, r0, r1, lsl #3
914 add r0, r0, #4
915 bx lr
916 7:
917 @ The same writeback as in loop_filter_h_4_8
918 sub r12, r0, #2
919 add r0, r12, r1, lsl #2
920 transpose_q_4x4 q11, q12, d22, d23, d24, d25
921 vst1.32 {d22[0]}, [r12], r1
922 vst1.32 {d22[1]}, [r0], r1
923 vst1.32 {d23[0]}, [r12], r1
924 vst1.32 {d23[1]}, [r0], r1
925 vst1.32 {d24[0]}, [r12], r1
926 vst1.32 {d24[1]}, [r0], r1
927 vst1.32 {d25[0]}, [r12], r1
928 vst1.32 {d25[1]}, [r0], r1
929 sub r0, r0, r1, lsl #3
930 add r0, r0, #2
931 bx lr
932 endfunc
933
934 function ff_vp9_loop_filter_h_16_8_neon, export=1
935 ldr r12, [sp]
936 push {lr}
937 vpush {q4-q7}
938 push {r12}
939 bl vp9_loop_filter_h_16_neon
940 add sp, sp, #4
941 vpop {q4-q7}
942 pop {pc}
943 endfunc
944
945 function ff_vp9_loop_filter_h_16_16_neon, export=1
946 ldr r12, [sp]
947 // The filter clobbers r2 and r3, but we need to keep them for the second round
948 push {r2, r3, lr}
949 vpush {q4-q7}
950 push {r12}
951 bl vp9_loop_filter_h_16_neon
952 add r0, r0, r1, lsl #3
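@ Same stack layout as in the vertical 16x16 variant above: the saved E
@ and I are at sp + 68 and sp + 72.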
953 ldr r2, [sp, #68]
954 ldr r3, [sp, #72]
955 bl vp9_loop_filter_h_16_neon
956 add sp, sp, #4
957 vpop {q4-q7}
958 pop {r2, r3, pc}
959 endfunc