aarch64: vp8: Fix assembling with clang
libavcodec/aarch64/vp8dsp_neon.S
/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

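// x0: dst, x1: block (16 int16 coefficients, cleared on return), x2: stride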
function ff_vp8_idct_add_neon, export=1
        ld1             {v0.8b - v3.8b}, [x1]
        mov             w4, #20091
        movk            w4, #35468/2, lsl 16
        dup             v4.2s, w4
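        // v4.h[0] = 20091 (sqrt(2)*cos(pi/8) - 1 in Q16) and
        // v4.h[1] = 35468/2 (sqrt(2)*sin(pi/8) in Q16, halved because
        // sqdmulh doubles the product); the standard VP8 IDCT constants.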

        smull           v26.4s, v1.4h, v4.h[0]
        smull           v27.4s, v3.4h, v4.h[0]
        sqdmulh         v20.4h, v1.4h, v4.h[1]
        sqdmulh         v23.4h, v3.4h, v4.h[1]
        sqshrn          v21.4h, v26.4s, #16
        sqshrn          v22.4h, v27.4s, #16
        add             v21.4h, v21.4h, v1.4h
        add             v22.4h, v22.4h, v3.4h

        add             v16.4h, v0.4h, v2.4h
        sub             v17.4h, v0.4h, v2.4h

        add             v18.4h, v21.4h, v23.4h
        sub             v19.4h, v20.4h, v22.4h

        add             v0.4h, v16.4h, v18.4h
        add             v1.4h, v17.4h, v19.4h
        sub             v3.4h, v16.4h, v18.4h
        sub             v2.4h, v17.4h, v19.4h

        transpose_4x4H  v0, v1, v2, v3, v24, v5, v6, v7

        movi            v29.8h, #0
        smull           v26.4s, v1.4h, v4.h[0]
        st1             {v29.8h}, [x1], #16
        smull           v27.4s, v3.4h, v4.h[0]
        st1             {v29.16b}, [x1]
        sqdmulh         v21.4h, v1.4h, v4.h[1]
        sqdmulh         v23.4h, v3.4h, v4.h[1]
        sqshrn          v20.4h, v26.4s, #16
        sqshrn          v22.4h, v27.4s, #16
        add             v20.4h, v20.4h, v1.4h
        add             v22.4h, v22.4h, v3.4h
        add             v16.4h, v0.4h, v2.4h
        sub             v17.4h, v0.4h, v2.4h

        add             v18.4h, v20.4h, v23.4h
        ld1             {v24.d}[0], [x0], x2
        zip1            v16.2d, v16.2d, v17.2d
        sub             v19.4h, v21.4h, v22.4h
        ld1             {v25.d}[0], [x0], x2
        zip1            v18.2d, v18.2d, v19.2d
        add             v0.8h, v16.8h, v18.8h
        ld1             {v25.d}[1], [x0], x2
        sub             v1.8h, v16.8h, v18.8h
        ld1             {v24.d}[1], [x0], x2
        srshr           v0.8h, v0.8h, #3
        trn1            v24.4s, v24.4s, v25.4s
        srshr           v1.8h, v1.8h, #3
        sub             x0, x0, x2, lsl #2

        ext             v1.16b, v1.16b, v1.16b, #8
        trn1            v3.2d, v0.2d, v1.2d
        trn2            v0.2d, v0.2d, v1.2d
        trn1            v1.8h, v3.8h, v0.8h
        trn2            v3.8h, v3.8h, v0.8h
        uzp1            v0.4s, v1.4s, v3.4s
        uzp2            v1.4s, v3.4s, v1.4s

        uaddw           v0.8h, v0.8h, v24.8b
        uaddw2          v1.8h, v1.8h, v24.16b
        sqxtun          v0.8b, v0.8h
        sqxtun2         v0.16b, v1.8h
        st1             {v0.s}[0], [x0], x2
        st1             {v0.s}[1], [x0], x2
        st1             {v0.s}[3], [x0], x2
        st1             {v0.s}[2], [x0], x2

        ret
endfunc

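// x0: dst, x1: block (4 consecutive 16-coefficient blocks; only the DC of
// each is used and then cleared), x2: stride. x3 is set to #32, the byte
// distance between the DC terms of consecutive blocks.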
function ff_vp8_idct_dc_add4y_neon, export=1
        movi            v0.16b, #0
        mov             x3, #32
        ld1r            {v16.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v17.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        zip1            v16.2d, v16.2d, v17.2d
        ld1r            {v18.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v19.4h}, [x1]
        st1             {v0.h}[0], [x1], x3
        zip1            v18.2d, v18.2d, v19.2d
        srshr           v16.8h, v16.8h, #3  // dc >>= 3
        ld1             {v0.16b}, [x0], x2
        srshr           v18.8h, v18.8h, #3
        ld1             {v1.16b}, [x0], x2
        uaddw           v20.8h, v16.8h, v0.8b
        ld1             {v2.16b}, [x0], x2
        uaddw2          v0.8h, v18.8h, v0.16b
        ld1             {v3.16b}, [x0], x2
        uaddw           v21.8h, v16.8h, v1.8b
        uaddw2          v1.8h, v18.8h, v1.16b
        uaddw           v22.8h, v16.8h, v2.8b
        uaddw2          v2.8h, v18.8h, v2.16b
        uaddw           v23.8h, v16.8h, v3.8b
        uaddw2          v3.8h, v18.8h, v3.16b
        sub             x0, x0, x2, lsl #2
        sqxtun          v20.8b, v20.8h
        sqxtun2         v20.16b, v0.8h
        sqxtun          v21.8b, v21.8h
        sqxtun2         v21.16b, v1.8h
        sqxtun          v22.8b, v22.8h
        st1             {v20.16b}, [x0], x2
        sqxtun2         v22.16b, v2.8h
        st1             {v21.16b}, [x0], x2
        sqxtun          v23.8b, v23.8h
        st1             {v22.16b}, [x0], x2
        sqxtun2         v23.16b, v3.8h
        st1             {v23.16b}, [x0], x2

        ret
endfunc

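// x0: dst, x1: block (only block[0], the DC, is read and then cleared),
// x2: stride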
function ff_vp8_idct_dc_add_neon, export=1
        mov             w3, #0
        ld1r            {v2.8h}, [x1]
        strh            w3, [x1]
        srshr           v2.8h, v2.8h, #3
        ld1             {v0.s}[0], [x0], x2
        ld1             {v0.s}[1], [x0], x2
        uaddw           v3.8h, v2.8h, v0.8b
        ld1             {v1.s}[0], [x0], x2
        ld1             {v1.s}[1], [x0], x2
        uaddw           v4.8h, v2.8h, v1.8b
        sqxtun          v0.8b, v3.8h
        sqxtun          v1.8b, v4.8h
        sub             x0, x0, x2, lsl #2
        st1             {v0.s}[0], [x0], x2
        st1             {v0.s}[1], [x0], x2
        st1             {v1.s}[0], [x0], x2
        st1             {v1.s}[1], [x0], x2
        ret
endfunc

// Register layout:
// P3..Q3 -> v0..v7
// flim_E -> v22
// flim_I -> v23
// hev_thresh -> passed as the \hev_thresh macro argument (w4 or w5)
//
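// The simple filter only examines P1..Q1 (v2-v5) and flim_E; flim_I and
// hev_thresh are not used in that case.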
.macro  vp8_loop_filter, inner=0, simple=0, hev_thresh
.if \simple
        uabd            v17.16b, v3.16b, v4.16b  // abs(P0-Q0)
        uabd            v23.16b, v2.16b, v5.16b  // abs(P1-Q1)
        uqadd           v17.16b, v17.16b, v17.16b  // abs(P0-Q0) * 2
        ushr            v18.16b, v23.16b, #1  // abs(P1-Q1) / 2
        uqadd           v19.16b, v17.16b, v18.16b  // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        movi            v21.16b, #0x80
        cmhs            v16.16b, v22.16b, v19.16b  // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
.else
        // calculate hev and normal_limit:
        uabd            v20.16b, v2.16b, v3.16b  // abs(P1-P0)
        uabd            v21.16b, v5.16b, v4.16b  // abs(Q1-Q0)
        uabd            v18.16b, v0.16b, v1.16b  // abs(P3-P2)
        uabd            v19.16b, v1.16b, v2.16b  // abs(P2-P1)
        cmhs            v16.16b, v23.16b, v20.16b  // abs(P1-P0) <= flim_I
        cmhs            v17.16b, v23.16b, v21.16b  // abs(Q1-Q0) <= flim_I
        cmhs            v18.16b, v23.16b, v18.16b  // abs(P3-P2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b  // abs(P2-P1) <= flim_I
        and             v16.16b, v17.16b, v16.16b
        uabd            v17.16b, v7.16b, v6.16b  // abs(Q3-Q2)
        and             v16.16b, v16.16b, v19.16b
        uabd            v19.16b, v6.16b, v5.16b  // abs(Q2-Q1)
        and             v16.16b, v16.16b, v18.16b
        cmhs            v18.16b, v23.16b, v17.16b  // abs(Q3-Q2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b  // abs(Q2-Q1) <= flim_I
        uabd            v17.16b, v3.16b, v4.16b  // abs(P0-Q0)
        uabd            v23.16b, v2.16b, v5.16b  // abs(P1-Q1)
        and             v16.16b, v16.16b, v18.16b
        uqadd           v17.16b, v17.16b, v17.16b  // abs(P0-Q0) * 2
        and             v16.16b, v16.16b, v19.16b
        ushr            v18.16b, v23.16b, #1  // abs(P1-Q1) / 2
        dup             v23.16b, \hev_thresh  // hev_thresh
        uqadd           v19.16b, v17.16b, v18.16b  // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        cmhi            v20.16b, v20.16b, v23.16b  // abs(P1-P0) > hev_thresh
        cmhs            v19.16b, v22.16b, v19.16b  // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        cmhi            v22.16b, v21.16b, v23.16b  // abs(Q1-Q0) > hev_thresh
        and             v16.16b, v16.16b, v19.16b
        movi            v21.16b, #0x80
        orr             v17.16b, v20.16b, v22.16b
.endif

        // at this point:
        //   v16: normal_limit
        //   v17: hev

        // convert to signed value:
        eor             v3.16b, v3.16b, v21.16b  // PS0 = P0 ^ 0x80
        eor             v4.16b, v4.16b, v21.16b  // QS0 = Q0 ^ 0x80

        movi            v20.8h, #3
        ssubl           v18.8h, v4.8b, v3.8b  // QS0 - PS0
        ssubl2          v19.8h, v4.16b, v3.16b  // (widened to 16bit)
        eor             v2.16b, v2.16b, v21.16b  // PS1 = P1 ^ 0x80
        eor             v5.16b, v5.16b, v21.16b  // QS1 = Q1 ^ 0x80
        mul             v18.8h, v18.8h, v20.8h  // w = 3 * (QS0 - PS0)
        mul             v19.8h, v19.8h, v20.8h

        sqsub           v20.16b, v2.16b, v5.16b  // clamp(PS1-QS1)
        movi            v22.16b, #4
        movi            v23.16b, #3
.if \inner
        and             v20.16b, v20.16b, v17.16b  // if(hev) w += clamp(PS1-QS1)
.endif
        saddw           v18.8h, v18.8h, v20.8b  // w += clamp(PS1-QS1)
        saddw2          v19.8h, v19.8h, v20.16b
        sqxtn           v18.8b, v18.8h  // narrow result back into v18
        sqxtn2          v18.16b, v19.8h
.if !\inner && !\simple
        eor             v1.16b, v1.16b, v21.16b  // PS2 = P2 ^ 0x80
        eor             v6.16b, v6.16b, v21.16b  // QS2 = Q2 ^ 0x80
.endif
        and             v18.16b, v18.16b, v16.16b  // w &= normal_limit

        // registers used at this point..
        //   v0 -> P3  (don't corrupt)
        //   v1-v6 -> PS2-QS2
        //   v7 -> Q3  (don't corrupt)
        //   v17 -> hev
        //   v18 -> w
        //   v21 -> #0x80
        //   v22 -> #4
        //   v23 -> #3
        //   v16, v19, v29 -> unused
        //
        // filter_common:   is4tap==1
        //   c1 = clamp(w + 4) >> 3;
        //   c2 = clamp(w + 3) >> 3;
        //   Q0 = s2u(QS0 - c1);
        //   P0 = s2u(PS0 + c2);
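        // (s2u(x) here is x ^ 0x80, i.e. flipping the sign bit to convert
        // the biased signed value back to an unsigned pixel.)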

.if \simple
        sqadd           v19.16b, v18.16b, v22.16b  // c1 = clamp((w&hev)+4)
        sqadd           v20.16b, v18.16b, v23.16b  // c2 = clamp((w&hev)+3)
        sshr            v19.16b, v19.16b, #3  // c1 >>= 3
        sshr            v20.16b, v20.16b, #3  // c2 >>= 3
        sqsub           v4.16b, v4.16b, v19.16b  // QS0 = clamp(QS0-c1)
        sqadd           v3.16b, v3.16b, v20.16b  // PS0 = clamp(PS0+c2)
        eor             v4.16b, v4.16b, v21.16b  // Q0 = QS0 ^ 0x80
        eor             v3.16b, v3.16b, v21.16b  // P0 = PS0 ^ 0x80
        eor             v5.16b, v5.16b, v21.16b  // Q1 = QS1 ^ 0x80
        eor             v2.16b, v2.16b, v21.16b  // P1 = PS1 ^ 0x80
.elseif \inner
        // the !is4tap case of filter_common, only used for inner blocks
        //   c3 = ((c1&~hev) + 1) >> 1;
        //   Q1 = s2u(QS1 - c3);
        //   P1 = s2u(PS1 + c3);
        sqadd           v19.16b, v18.16b, v22.16b  // c1 = clamp((w&hev)+4)
        sqadd           v20.16b, v18.16b, v23.16b  // c2 = clamp((w&hev)+3)
        sshr            v19.16b, v19.16b, #3  // c1 >>= 3
        sshr            v20.16b, v20.16b, #3  // c2 >>= 3
        sqsub           v4.16b, v4.16b, v19.16b  // QS0 = clamp(QS0-c1)
        sqadd           v3.16b, v3.16b, v20.16b  // PS0 = clamp(PS0+c2)
        bic             v19.16b, v19.16b, v17.16b  // c1 & ~hev
        eor             v4.16b, v4.16b, v21.16b  // Q0 = QS0 ^ 0x80
        srshr           v19.16b, v19.16b, #1  // c3 >>= 1
        eor             v3.16b, v3.16b, v21.16b  // P0 = PS0 ^ 0x80
        sqsub           v5.16b, v5.16b, v19.16b  // QS1 = clamp(QS1-c3)
        sqadd           v2.16b, v2.16b, v19.16b  // PS1 = clamp(PS1+c3)
        eor             v5.16b, v5.16b, v21.16b  // Q1 = QS1 ^ 0x80
        eor             v2.16b, v2.16b, v21.16b  // P1 = PS1 ^ 0x80
.else
        and             v20.16b, v18.16b, v17.16b  // w & hev
        sqadd           v19.16b, v20.16b, v22.16b  // c1 = clamp((w&hev)+4)
        sqadd           v20.16b, v20.16b, v23.16b  // c2 = clamp((w&hev)+3)
        sshr            v19.16b, v19.16b, #3  // c1 >>= 3
        sshr            v20.16b, v20.16b, #3  // c2 >>= 3
        bic             v18.16b, v18.16b, v17.16b  // w &= ~hev
        sqsub           v4.16b, v4.16b, v19.16b  // QS0 = clamp(QS0-c1)
        sqadd           v3.16b, v3.16b, v20.16b  // PS0 = clamp(PS0+c2)

        // filter_mbedge:
        //   a = clamp((27*w + 63) >> 7);
        //   Q0 = s2u(QS0 - a);
        //   P0 = s2u(PS0 + a);
        //   a = clamp((18*w + 63) >> 7);
        //   Q1 = s2u(QS1 - a);
        //   P1 = s2u(PS1 + a);
        //   a = clamp((9*w + 63) >> 7);
        //   Q2 = s2u(QS2 - a);
        //   P2 = s2u(PS2 + a);
        movi            v17.8h, #63
        sshll           v22.8h, v18.8b, #3
        sshll2          v23.8h, v18.16b, #3
        saddw           v22.8h, v22.8h, v18.8b
        saddw2          v23.8h, v23.8h, v18.16b
        add             v16.8h, v17.8h, v22.8h
        add             v17.8h, v17.8h, v23.8h  //  9*w + 63
        add             v19.8h, v16.8h, v22.8h
        add             v20.8h, v17.8h, v23.8h  // 18*w + 63
        add             v22.8h, v19.8h, v22.8h
        add             v23.8h, v20.8h, v23.8h  // 27*w + 63
        sqshrn          v16.8b, v16.8h, #7
        sqshrn2         v16.16b, v17.8h, #7  // clamp(( 9*w + 63)>>7)
        sqshrn          v19.8b, v19.8h, #7
        sqshrn2         v19.16b, v20.8h, #7  // clamp((18*w + 63)>>7)
        sqshrn          v22.8b, v22.8h, #7
        sqshrn2         v22.16b, v23.8h, #7  // clamp((27*w + 63)>>7)
        sqadd           v1.16b, v1.16b, v16.16b  // PS2 = clamp(PS2+a)
        sqsub           v6.16b, v6.16b, v16.16b  // QS2 = clamp(QS2-a)
        sqadd           v2.16b, v2.16b, v19.16b  // PS1 = clamp(PS1+a)
        sqsub           v5.16b, v5.16b, v19.16b  // QS1 = clamp(QS1-a)
        sqadd           v3.16b, v3.16b, v22.16b  // PS0 = clamp(PS0+a)
        sqsub           v4.16b, v4.16b, v22.16b  // QS0 = clamp(QS0-a)
        eor             v3.16b, v3.16b, v21.16b  // P0 = PS0 ^ 0x80
        eor             v4.16b, v4.16b, v21.16b  // Q0 = QS0 ^ 0x80
        eor             v2.16b, v2.16b, v21.16b  // P1 = PS1 ^ 0x80
        eor             v5.16b, v5.16b, v21.16b  // Q1 = QS1 ^ 0x80
        eor             v1.16b, v1.16b, v21.16b  // P2 = PS2 ^ 0x80
        eor             v6.16b, v6.16b, v21.16b  // Q2 = QS2 ^ 0x80
.endif
.endm

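// Vertical loop filter on a 16-pixel-wide horizontal edge:
// x0: dst (first row below the edge), x1: stride, w2: flim_E,
// w3: flim_I, w4: hev_thresh
// The _simple variant only reads x0, x1 and w2.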
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        sub             x0, x0, x1, lsl #1+!\simple

        // Load pixels:
.if !\simple
        ld1             {v0.16b}, [x0], x1  // P3
        ld1             {v1.16b}, [x0], x1  // P2
.endif
        ld1             {v2.16b}, [x0], x1  // P1
        ld1             {v3.16b}, [x0], x1  // P0
        ld1             {v4.16b}, [x0], x1  // Q0
        ld1             {v5.16b}, [x0], x1  // Q1
.if !\simple
        ld1             {v6.16b}, [x0], x1  // Q2
        ld1             {v7.16b}, [x0]      // Q3
        dup             v23.16b, w3  // flim_I
.endif
        dup             v22.16b, w2  // flim_E

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        // back up to P2:  dst -= stride * 6
        sub             x0, x0, x1, lsl #2
.if !\simple
        sub             x0, x0, x1, lsl #1

        // Store pixels:
        st1             {v1.16b}, [x0], x1  // P2
.endif
        st1             {v2.16b}, [x0], x1  // P1
        st1             {v3.16b}, [x0], x1  // P0
        st1             {v4.16b}, [x0], x1  // Q0
        st1             {v5.16b}, [x0], x1  // Q1
.if !\simple
        st1             {v6.16b}, [x0]      // Q2
.endif

        ret
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1

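// Vertical loop filter on a horizontal edge of the chroma planes:
// x0: u dst, x1: v dst, x2: stride, w3: flim_E, w4: flim_I, w5: hev_thresh
// u and v are filtered together as one 16-byte vector, u in the low half
// of each register and v in the high half.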
.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        sub             x0, x0, x2, lsl #2
        sub             x1, x1, x2, lsl #2
        // Load pixels:
        ld1             {v0.d}[0], [x0], x2  // P3
        ld1             {v0.d}[1], [x1], x2  // P3
        ld1             {v1.d}[0], [x0], x2  // P2
        ld1             {v1.d}[1], [x1], x2  // P2
        ld1             {v2.d}[0], [x0], x2  // P1
        ld1             {v2.d}[1], [x1], x2  // P1
        ld1             {v3.d}[0], [x0], x2  // P0
        ld1             {v3.d}[1], [x1], x2  // P0
        ld1             {v4.d}[0], [x0], x2  // Q0
        ld1             {v4.d}[1], [x1], x2  // Q0
        ld1             {v5.d}[0], [x0], x2  // Q1
        ld1             {v5.d}[1], [x1], x2  // Q1
        ld1             {v6.d}[0], [x0], x2  // Q2
        ld1             {v6.d}[1], [x1], x2  // Q2
        ld1             {v7.d}[0], [x0]      // Q3
        ld1             {v7.d}[1], [x1]      // Q3

        dup             v22.16b, w3  // flim_E
        dup             v23.16b, w4  // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        // back up to P2:  u,v -= stride * 6
        sub             x0, x0, x2, lsl #2
        sub             x1, x1, x2, lsl #2
        sub             x0, x0, x2, lsl #1
        sub             x1, x1, x2, lsl #1

        // Store pixels:

        st1             {v1.d}[0], [x0], x2  // P2
        st1             {v1.d}[1], [x1], x2  // P2
        st1             {v2.d}[0], [x0], x2  // P1
        st1             {v2.d}[1], [x1], x2  // P1
        st1             {v3.d}[0], [x0], x2  // P0
        st1             {v3.d}[1], [x1], x2  // P0
        st1             {v4.d}[0], [x0], x2  // Q0
        st1             {v4.d}[1], [x1], x2  // Q0
        st1             {v5.d}[0], [x0], x2  // Q1
        st1             {v5.d}[1], [x1], x2  // Q1
        st1             {v6.d}[0], [x0]      // Q2
        st1             {v6.d}[1], [x1]      // Q2

        ret
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1

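// Horizontal loop filter on a vertical edge, 16 rows:
// x0: dst (first column right of the edge), x1: stride, w2: flim_E,
// w3: flim_I, w4: hev_thresh
// 4 pixels on each side of the edge are loaded for all 16 rows and
// transposed so the same vp8_loop_filter macro can be used, then
// transposed back before storing.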
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1

        sub             x0, x0, #4
        // Load pixels:
        ld1             {v0.d}[0], [x0], x1
        ld1             {v1.d}[0], [x0], x1
        ld1             {v2.d}[0], [x0], x1
        ld1             {v3.d}[0], [x0], x1
        ld1             {v4.d}[0], [x0], x1
        ld1             {v5.d}[0], [x0], x1
        ld1             {v6.d}[0], [x0], x1
        ld1             {v7.d}[0], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        ld1             {v2.d}[1], [x0], x1
        ld1             {v3.d}[1], [x0], x1
        ld1             {v4.d}[1], [x0], x1
        ld1             {v5.d}[1], [x0], x1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        dup             v22.16b, w2  // flim_E
.if !\simple
        dup             v23.16b, w3  // flim_I
.endif

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        sub             x0, x0, x1, lsl #4  // backup 16 rows

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        // Store pixels:
        st1             {v0.d}[0], [x0], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v2.d}[0], [x0], x1
        st1             {v3.d}[0], [x0], x1
        st1             {v4.d}[0], [x0], x1
        st1             {v5.d}[0], [x0], x1
        st1             {v6.d}[0], [x0], x1
        st1             {v7.d}[0], [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[1], [x0], x1
        st1             {v2.d}[1], [x0], x1
        st1             {v3.d}[1], [x0], x1
        st1             {v4.d}[1], [x0], x1
        st1             {v5.d}[1], [x0], x1
        st1             {v6.d}[1], [x0], x1
        st1             {v7.d}[1], [x0]

        ret
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1

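// Horizontal loop filter on a vertical edge of the chroma planes:
// x0: u dst, x1: v dst, x2: stride, w3: flim_E, w4: flim_I, w5: hev_thresh
// 4 pixels on each side of the edge are loaded from both planes
// (u rows in d[0], v rows in d[1]) and transposed before filtering.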
.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub             x0, x0, #4
        sub             x1, x1, #4

        // Load pixels:
        ld1             {v0.d}[0], [x0], x2  // load u
        ld1             {v0.d}[1], [x1], x2  // load v
        ld1             {v1.d}[0], [x0], x2
        ld1             {v1.d}[1], [x1], x2
        ld1             {v2.d}[0], [x0], x2
        ld1             {v2.d}[1], [x1], x2
        ld1             {v3.d}[0], [x0], x2
        ld1             {v3.d}[1], [x1], x2
        ld1             {v4.d}[0], [x0], x2
        ld1             {v4.d}[1], [x1], x2
        ld1             {v5.d}[0], [x0], x2
        ld1             {v5.d}[1], [x1], x2
        ld1             {v6.d}[0], [x0], x2
        ld1             {v6.d}[1], [x1], x2
        ld1             {v7.d}[0], [x0], x2
        ld1             {v7.d}[1], [x1], x2

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        dup             v22.16b, w3  // flim_E
        dup             v23.16b, w4  // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        sub             x0, x0, x2, lsl #3  // backup u 8 rows
        sub             x1, x1, x2, lsl #3  // backup v 8 rows

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        // Store pixels:
        st1             {v0.d}[0], [x0], x2  // store u
        st1             {v0.d}[1], [x1], x2  // store v
        st1             {v1.d}[0], [x0], x2
        st1             {v1.d}[1], [x1], x2
        st1             {v2.d}[0], [x0], x2
        st1             {v2.d}[1], [x1], x2
        st1             {v3.d}[0], [x0], x2
        st1             {v3.d}[1], [x1], x2
        st1             {v4.d}[0], [x0], x2
        st1             {v4.d}[1], [x1], x2
        st1             {v5.d}[0], [x0], x2
        st1             {v5.d}[1], [x1], x2
        st1             {v6.d}[0], [x0], x2
        st1             {v6.d}[1], [x1], x2
        st1             {v7.d}[0], [x0]
        st1             {v7.d}[1], [x1]

        ret

endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1


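// Plain copy: x0: dst, x1: dst stride, x2: src, x3: src stride, w4: h.
// The mx/my arguments of the MC prototype are not used here.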
function ff_put_vp8_pixels16_neon, export=1
1:
        subs            w4, w4, #4
        ld1             {v0.16b}, [x2], x3
        ld1             {v1.16b}, [x2], x3
        ld1             {v2.16b}, [x2], x3
        ld1             {v3.16b}, [x2], x3
        st1             {v0.16b}, [x0], x1
        st1             {v1.16b}, [x0], x1
        st1             {v2.16b}, [x0], x1
        st1             {v3.16b}, [x0], x1
        bgt             1b
        ret
endfunc

function ff_put_vp8_pixels8_neon, export=1
1:
        subs            w4, w4, #4
        ld1             {v0.8b},   [x2], x3
        ld1             {v0.d}[1], [x2], x3
        ld1             {v1.8b},   [x2], x3
        ld1             {v1.d}[1], [x2], x3
        st1             {v0.8b},   [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.8b},   [x0], x1
        st1             {v1.d}[1], [x0], x1
        bgt             1b
        ret
endfunc

/* 4/6-tap 8th-pel MC */

.macro  vp8_epel8_h6 d, s0, s1
        ext             v22.8b, \s0\().8b, \s1\().8b, #1
        uxtl            v18.8h, \s0\().8b
        ext             v23.8b, \s0\().8b, \s1\().8b, #2
        uxtl            v19.8h, v22.8b
        ext             v24.8b, \s0\().8b, \s1\().8b, #3
        uxtl            v21.8h, v23.8b
        ext             v25.8b, \s0\().8b, \s1\().8b, #4
        uxtl            v22.8h, v24.8b
        ext             v26.8b, \s0\().8b, \s1\().8b, #5
        uxtl            v25.8h, v25.8b
        mul             v21.8h, v21.8h, v0.h[2]
        uxtl            v26.8h, v26.8b
        mul             v22.8h, v22.8h, v0.h[3]
        mls             v21.8h, v19.8h, v0.h[1]
        mls             v22.8h, v25.8h, v0.h[4]
        mla             v21.8h, v18.8h, v0.h[0]
        mla             v22.8h, v26.8h, v0.h[5]
        sqadd           v22.8h, v21.8h, v22.8h
        sqrshrun        \d\().8b, v22.8h, #7
.endm

.macro  vp8_epel16_h6 d0, v0, v1
        ext             v22.16b, \v0\().16b, \v1\().16b, #3
        ext             v23.16b, \v0\().16b, \v1\().16b, #4
        uxtl            v19.8h, v22.8b
        uxtl2           v22.8h, v22.16b
        ext             v3.16b, \v0\().16b, \v1\().16b, #2
        uxtl            v20.8h, v23.8b
        uxtl2           v23.8h, v23.16b
        ext             v16.16b, \v0\().16b, \v1\().16b, #1
        uxtl            v18.8h, v3.8b
        uxtl2           v3.8h, v3.16b
        ext             v2.16b, \v0\().16b, \v1\().16b, #5
        uxtl            v21.8h, v2.8b
        uxtl2           v2.8h, v2.16b
        uxtl            v17.8h, v16.8b
        uxtl2           v16.8h, v16.16b
        mul             v19.8h, v19.8h, v0.h[3]
        mul             v18.8h, v18.8h, v0.h[2]
        mul             v3.8h, v3.8h, v0.h[2]
        mul             v22.8h, v22.8h, v0.h[3]
        mls             v19.8h, v20.8h, v0.h[4]
        uxtl            v20.8h, \v0\().8b
        uxtl2           v1.8h, \v0\().16b
        mls             v18.8h, v17.8h, v0.h[1]
        mls             v3.8h, v16.8h, v0.h[1]
        mls             v22.8h, v23.8h, v0.h[4]
        mla             v18.8h, v20.8h, v0.h[0]
        mla             v19.8h, v21.8h, v0.h[5]
        mla             v3.8h, v1.8h, v0.h[0]
        mla             v22.8h, v2.8h, v0.h[5]
        sqadd           v19.8h, v18.8h, v19.8h
        sqadd           v22.8h, v3.8h, v22.8h
        sqrshrun        \d0\().8b, v19.8h, #7
        sqrshrun2       \d0\().16b, v22.8h, #7
.endm

.macro  vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s4\().8h, \s4\().8b
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s5\().8h, \s5\().8b
        mul             \s2\().8h, \s2\().8h, v0.h[2]
        mul             \s3\().8h, \s3\().8h, v0.h[3]
        mls             \s2\().8h, \s1\().8h, v0.h[1]
        mls             \s3\().8h, \s4\().8h, v0.h[4]
        mla             \s2\().8h, \s0\().8h, v0.h[0]
        mla             \s3\().8h, \s5\().8h, v0.h[5]
        sqadd           \s3\().8h, \s2\().8h, \s3\().8h
        sqrshrun        \d0\().8b, \s3\().8h, #7
.endm

.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s6\().8h, \s6\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s4\().8h, \s4\().8b
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s5\().8h, \s5\().8b
        mul             \s0\().8h, \s0\().8h, v0.h[0]
        mul             v31.8h, \s3\().8h, v0.h[3]
        mul             \s3\().8h, \s3\().8h, v0.h[2]
        mul             \s6\().8h, \s6\().8h, v0.h[5]

        mls             \s0\().8h, \s1\().8h, v0.h[1]
        mls             v31.8h, \s4\().8h, v0.h[4]
        mls             \s3\().8h, \s2\().8h, v0.h[1]
        mls             \s6\().8h, \s5\().8h, v0.h[4]

        mla             \s0\().8h, \s2\().8h, v0.h[2]
        mla             v31.8h, \s5\().8h, v0.h[5]
        mla             \s3\().8h, \s1\().8h, v0.h[0]
        mla             \s6\().8h, \s4\().8h, v0.h[3]
        sqadd           v31.8h, \s0\().8h, v31.8h
        sqadd           \s6\().8h, \s3\().8h, \s6\().8h
        sqrshrun        \d0\().8b, v31.8h, #7
        sqrshrun        \d1\().8b, \s6\().8h, #7
.endm

.macro  vp8_epel8_h4 d, v0, v1
        ext             v22.8b, \v0\().8b, \v1\().8b, #1
        uxtl            v19.8h, \v0\().8b
        ext             v23.8b, \v0\().8b, \v1\().8b, #2
        uxtl            v20.8h, v22.8b
        ext             v25.8b, \v0\().8b, \v1\().8b, #3
        uxtl            v22.8h, v23.8b
        uxtl            v25.8h, v25.8b
        mul             v20.8h, v20.8h, v0.h[2]
        mul             v22.8h, v22.8h, v0.h[3]
        mls             v20.8h, v19.8h, v0.h[1]
        mls             v22.8h, v25.8h, v0.h[4]
        sqadd           v22.8h, v20.8h, v22.8h
        sqrshrun        \d\().8b, v22.8h, #7
.endm

.macro  vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s4\().8h, \s4\().8b
        mul             v21.8h, \s1\().8h, v0.h[2]
        mul             v23.8h, \s2\().8h, v0.h[3]
        mul             \s2\().8h, \s2\().8h, v0.h[2]
        mul             v22.8h, \s3\().8h, v0.h[3]
        mls             v21.8h, \s0\().8h, v0.h[1]
        mls             v23.8h, \s3\().8h, v0.h[4]
        mls             \s2\().8h, \s1\().8h, v0.h[1]
        mls             v22.8h, \s4\().8h, v0.h[4]
        sqadd           v21.8h, v21.8h, v23.8h
        sqadd           \s2\().8h, \s2\().8h, v22.8h
        sqrshrun        \d0\().8b, v21.8h, #7
        sqrshrun2       \d0\().16b, \s2\().8h, #7
.endm


// note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
// arithmetic can be used to apply filters
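// The table holds the absolute values of the VP8 subpel filter taps for
// mx/my = 1..7; the signs are applied by the mla/mls pattern in the filter
// macros above, and mx/my == 0 is handled by the plain copy functions.
// Each row is padded to 8 halfwords (16 bytes).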
const   subpel_filters, align=4
        .short          0,  6, 123,  12,  1, 0, 0, 0
        .short          2, 11, 108,  36,  8, 1, 0, 0
        .short          0,  9,  93,  50,  6, 0, 0, 0
        .short          3, 16,  77,  77, 16, 3, 0, 0
        .short          0,  6,  50,  93,  9, 0, 0, 0
        .short          1,  8,  36, 108, 11, 2, 0, 0
        .short          0,  1,  12, 123,  6, 0, 0, 0
endconst

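// Subpel MC argument registers (common to the epel functions below):
// x0: dst, x1: dst stride, x2: src, x3: src stride, w4: h, w5: mx, w6: my
// mx/my are in 1..7 here, so "movrel x17, subpel_filters, -16" together
// with "add xN, x17, xN, lsl #4" addresses row (mx-1)/(my-1) of the
// 16-byte-per-row table.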
function ff_put_vp8_epel16_v6_neon, export=1
        sub             x2, x2, x3, lsl #1

        sxtw            x4, w4
        sxtw            x6, w6
        movrel          x17, subpel_filters, -16
        add             x6, x17, x6, lsl #4  // y
        ld1             {v0.8h}, [x6]
1:
        ld1             {v1.1d - v2.1d},   [x2], x3
        ld1             {v3.1d - v4.1d},   [x2], x3
        ld1             {v16.1d - v17.1d}, [x2], x3
        ld1             {v18.1d - v19.1d}, [x2], x3
        ld1             {v20.1d - v21.1d}, [x2], x3
        ld1             {v22.1d - v23.1d}, [x2], x3
        ld1             {v24.1d - v25.1d}, [x2]
        sub             x2, x2, x3, lsl #2

        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25

        st1             {v1.1d - v2.1d}, [x0], x1
        st1             {v3.1d - v4.1d}, [x0], x1
        subs            x4, x4, #2
        bne             1b

        ret
endfunc

function ff_put_vp8_epel16_h6_neon, export=1
        sub             x2, x2, #2
        sxtw            x5, w5  // x

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        add             x5, x17, x5, lsl #4  // x
        ld1             {v0.8h}, [x5]
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1, v1, v2
        st1             {v1.16b}, [x0], x1

        subs            w4, w4, #1
        bne             1b
        ret
endfunc


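// Two-pass 6-tap filtering: the first pass filters horizontally into a
// temporary buffer on the stack, the second pass filters that buffer
// vertically into dst. The buffer holds h+5 rows of 16 bytes (h is at
// most 16, hence 21*16 = 336 bytes), plus 16 bytes so x7 can be rounded
// up to 16-byte alignment.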
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, #2

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5, w5  // x
        add             x16, x17, x5, lsl #4  // x
        sub             sp, sp, #336+16
        ld1             {v0.8h}, [x16]
        add             x7, sp, #15
        sxtw            x4, w4
        add             x16, x4, #5  // h
        bic             x7, x7, #15
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1, v1, v2
        st1             {v1.16b}, [x7], #16
        subs            x16, x16, #1
        bne             1b


        // second pass (vertical):
        sxtw            x6, w6
        add             x6, x17, x6, lsl #4  // y
        add             x7, sp, #15
        ld1             {v0.8h}, [x6]
        bic             x7, x7, #15
2:
        ld1             {v1.8b - v4.8b},   [x7], #32
        ld1             {v16.8b - v19.8b}, [x7], #32
        ld1             {v20.8b - v23.8b}, [x7]
        sub             x7, x7, #48

        vp8_epel8_v6    v5, v1, v3, v16, v18, v20, v22
        vp8_epel8_v6    v2, v2, v4, v17, v19, v21, v23
        trn1            v2.2d, v5.2d, v2.2d

        st1             {v2.16b}, [x0], x1
        subs            x4, x4, #1
        bne             2b

        add             sp, sp, #336+16
        ret
endfunc

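// Same two-pass scheme as above, with 8-byte rows: h+5 rows of 8 bytes
// (at most 21*8 = 168 bytes) plus 16 bytes for alignment of x7.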
function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, #2
        sxtw            x4, w4

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5, w5
        add             x5, x17, x5, lsl #4  // x
        sub             sp, sp, #168+16
        ld1             {v0.8h}, [x5]
        add             x7, sp, #15
        add             x16, x4, #5  // h
        bic             x7, x7, #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        bne             1b

        // second pass (vertical):
        sxtw            x6, w6
        add             x6, x17, x6, lsl #4  // y
        add             x7, sp, #15
        ld1             {v0.8h}, [x6]
        bic             x7, x7, #15
2:
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]

        sub             x7, x7, #16

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        subs            x4, x4, #2
        bne             2b

        add             sp, sp, #168+16
        ret
endfunc

function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             x2, x2, x3, lsl #1
        sub             x2, x2, #1
        sxtw            x4, w4

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5, w5
        add             x5, x17, x5, lsl #4  // x
        sub             sp, sp, #168+16
        ld1             {v0.8h}, [x5]
        add             x7, sp, #15
        add             x16, x4, #5  // h
        bic             x7, x7, #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        bne             1b

        // second pass (vertical):
        sxtw            x6, w6
        add             x6, x17, x6, lsl #4  // y
        add             x7, sp, #15
        ld1             {v0.8h}, [x6]
        bic             x7, x7, #15
2:
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]

        sub             x7, x7, #16

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        subs            x4, x4, #2
        bne             2b

        add             sp, sp, #168+16
        ret
endfunc

function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             x2, x2, x3
        sub             x2, x2, #1
        sxtw            x4, w4


        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5, w5
        add             x5, x17, x5, lsl #4  // x
        sub             sp, sp, #168+16
        ld1             {v0.8h}, [x5]
        add             x7, sp, #15
        add             x16, x4, #3  // h
        bic             x7, x7, #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        bne             1b

        // second pass (vertical):
        sxtw            x6, w6
        add             x6, x17, x6, lsl #4  // y
        add             x7, sp, #15
        ld1             {v0.8h}, [x6]
        bic             x7, x7, #15
2:
        ld1             {v1.8b - v2.8b}, [x7], #16
        ld1             {v3.8b - v5.8b}, [x7]

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4, x4, #2
        bne             2b

        add             sp, sp, #168+16
        ret
endfunc

function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             x2, x2, x3
        sub             x2, x2, #2
        sxtw            x4, w4


        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5, w5
        add             x5, x17, x5, lsl #4  // x
        sub             sp, sp, #168+16
        ld1             {v0.8h}, [x5]
        add             x7, sp, #15
        add             x16, x4, #3  // h
        bic             x7, x7, #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        bne             1b

        // second pass (vertical):
        sxtw            x6, w6
        add             x6, x17, x6, lsl #4  // y
        add             x7, sp, #15
        ld1             {v0.8h}, [x6]
        bic             x7, x7, #15
2:
        ld1             {v1.8b - v2.8b}, [x7], #16
        ld1             {v3.8b - v5.8b}, [x7]

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4, x4, #2
        bne             2b

        add             sp, sp, #168+16
        ret
endfunc