/*
 * h264/aarch64: sign extend int stride in loop filter asm
 * [libav.git] / libavcodec / aarch64 / h264dsp_neon.S
 * Blame view (columns: Commit, Line, Data); first entry: 36e3b1f2, JG
 */
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21
22#include "libavutil/aarch64/asm.S"
23#include "neon.S"
24
/*
 * h264_loop_filter_start: shared entry check of the loop filter functions.
 *
 * In:    w2 = alpha, w3 = beta,
 *        x4 = pointer to four signed bytes, read as one 32-bit word
 *             (called tc0 below; presumably the H.264 tc0 table).
 * Out:   v24.S[0] = the four tc0 bytes as loaded.
 * Clobb: w6, flags.
 *
 * Executes ret (leaving the enclosing function) when
 *   - alpha != 0 and beta == 0, or
 *   - all four tc0 bytes have their sign bit set.
 * Otherwise execution continues after the macro, at local label 2.
 *
 * NOTE(review): when alpha == 0 the ccmp loads NZCV = 0, so the b.eq early
 * exit is not taken for that case -- confirm whether that is intended.
 */
.macro  h264_loop_filter_start
        cmp             w2,  #0
        ldr             w6,  [x4]
        ccmp            w3,  #0,  #0,  ne       // alpha != 0 ? cmp beta, 0 : NZCV = 0
        mov             v24.S[0], w6
        and             w6,  w6,  w6,  lsl #16  // plain and: flags still from ccmp
        b.eq            1f
        ands            w6,  w6,  w6,  lsl #8   // bit 31 = AND of the four byte sign bits
        b.ge            2f                      // N clear: at least one tc0 byte >= 0
1:
        ret
2:
.endm
38
/*
 * h264_loop_filter_luma: filter 16 luma pixel positions across one edge.
 *
 * In:    v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
 *        (16 bytes each), w2 = alpha, w3 = beta,
 *        v24 low four bytes = tc0 values (as left by h264_loop_filter_start).
 * Out:   v17 = filtered p1, v16 = filtered p0,
 *        v0  = filtered q0, v19 = filtered q1.
 * Clobb: v4, v20, v21, v22, v23, v24, v28, v30.
 *
 * Each tc0 byte is first replicated over four consecutive lanes.  A lane is
 * filtered only if abs(p0 - q0) < alpha, abs(p1 - p0) < beta,
 * abs(q1 - q0) < beta and its tc0 byte is not negative.  The p0/q0 delta is
 * ((q0 - p0) * 4 + p1 - q1 + 4) >> 3, clamped to +-tc, where tc is tc0 plus
 * one for each of abs(p2 - p0) < beta and abs(q2 - q0) < beta.
 */
.macro  h264_loop_filter_luma
        dup             v22.16B, w2                     // alpha
        uxtl            v24.8H,  v24.8B
        uabd            v21.16B, v16.16B, v0.16B        // abs(p0 - q0)
        uxtl            v24.4S,  v24.4H
        uabd            v28.16B, v18.16B, v16.16B       // abs(p1 - p0)
        sli             v24.8H,  v24.8H,  #8
        uabd            v30.16B, v2.16B,  v0.16B        // abs(q1 - q0)
        sli             v24.4S,  v24.4S,  #16           // each tc0 byte now fills 4 lanes
        cmhi            v21.16B, v22.16B, v21.16B       // < alpha
        dup             v22.16B, w3                     // beta
        cmlt            v23.16B, v24.16B, #0            // lanes whose tc0 is negative
        cmhi            v28.16B, v22.16B, v28.16B       // < beta
        cmhi            v30.16B, v22.16B, v30.16B       // < beta
        bic             v21.16B, v21.16B, v23.16B
        uabd            v17.16B, v20.16B, v16.16B       // abs(p2 - p0)
        and             v21.16B, v21.16B, v28.16B
        uabd            v19.16B, v4.16B,  v0.16B        // abs(q2 - q0)
        cmhi            v17.16B, v22.16B, v17.16B       // < beta
        and             v21.16B, v21.16B, v30.16B       // v21 = lanes to filter
        cmhi            v19.16B, v22.16B, v19.16B       // < beta
        and             v17.16B, v17.16B, v21.16B       // v17 = lanes where p1 is updated
        and             v19.16B, v19.16B, v21.16B       // v19 = lanes where q1 is updated
        and             v24.16B, v24.16B, v21.16B       // tc0, zeroed in unfiltered lanes
        urhadd          v28.16B, v16.16B, v0.16B        // (p0 + q0 + 1) >> 1
        sub             v21.16B, v24.16B, v17.16B       // masks are 0 or -1: tc0 + 1 per set mask
        uqadd           v23.16B, v18.16B, v24.16B
        uhadd           v20.16B, v20.16B, v28.16B
        sub             v21.16B, v21.16B, v19.16B       // v21 = tc
        uhadd           v28.16B, v4.16B,  v28.16B
        umin            v23.16B, v23.16B, v20.16B
        uqsub           v22.16B, v18.16B, v24.16B
        uqadd           v4.16B,  v2.16B,  v24.16B
        umax            v23.16B, v23.16B, v22.16B       // p1 candidate, within p1 +- tc0
        uqsub           v22.16B, v2.16B,  v24.16B
        umin            v28.16B, v4.16B,  v28.16B
        uxtl            v4.8H,   v0.8B
        umax            v28.16B, v28.16B, v22.16B       // q1 candidate, within q1 +- tc0
        uxtl2           v20.8H,  v0.16B
        usubw           v4.8H,   v4.8H,   v16.8B
        usubw2          v20.8H,  v20.8H,  v16.16B
        shl             v4.8H,   v4.8H,   #2
        shl             v20.8H,  v20.8H,  #2
        uaddw           v4.8H,   v4.8H,   v18.8B
        uaddw2          v20.8H,  v20.8H,  v18.16B
        usubw           v4.8H,   v4.8H,   v2.8B
        usubw2          v20.8H,  v20.8H,  v2.16B
        rshrn           v4.8B,   v4.8H,   #3
        rshrn2          v4.16B,  v20.8H,  #3            // v4 = unclamped delta
        bsl             v17.16B, v23.16B, v18.16B       // mask ? candidate : original p1
        bsl             v19.16B, v28.16B, v2.16B        // mask ? candidate : original q1
        neg             v23.16B, v21.16B
        uxtl            v28.8H,  v16.8B
        smin            v4.16B,  v4.16B,  v21.16B
        uxtl2           v21.8H,  v16.16B
        smax            v4.16B,  v4.16B,  v23.16B       // delta clamped to +-tc
        uxtl            v22.8H,  v0.8B
        uxtl2           v24.8H,  v0.16B
        saddw           v28.8H,  v28.8H,  v4.8B         // p0 + delta
        saddw2          v21.8H,  v21.8H,  v4.16B
        ssubw           v22.8H,  v22.8H,  v4.8B         // q0 - delta
        ssubw2          v24.8H,  v24.8H,  v4.16B
        sqxtun          v16.8B,  v28.8H
        sqxtun2         v16.16B, v21.8H
        sqxtun          v0.8B,   v22.8H
        sqxtun2         v0.16B,  v24.8H
.endm
106
/*
 * ff_h264_v_loop_filter_luma_neon
 *
 * Filters a luma edge lying between two pixel rows, 16 pixels wide.
 * Presumably called from C as
 *   (uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 * -- confirm against the C prototype.
 *
 * In:    x0 = address of the q0 row (first row after the edge),
 *        w1 = row stride as a 32-bit signed int,
 *        w2 = alpha, w3 = beta, x4 = tc0 pointer.
 * Reads rows x0 - 3 * stride .. x0 + 2 * stride,
 * writes rows x0 - 2 * stride .. x0 + stride.
 * May return early from h264_loop_filter_start without touching memory.
 */
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1                 // stride is an int: sign extend before address math

        ld1             {v0.16B},  [x0], x1     // q0
        ld1             {v2.16B},  [x0], x1     // q1
        ld1             {v4.16B},  [x0], x1     // q2
        sub             x0,  x0,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #1    // back 6 rows in total: x0 = p2 row
        ld1             {v20.16B}, [x0], x1     // p2
        ld1             {v18.16B}, [x0], x1     // p1
        ld1             {v16.16B}, [x0], x1     // p0

        h264_loop_filter_luma

        sub             x0,  x0,  x1, lsl #1    // x0 = p1 row
        st1             {v17.16B}, [x0], x1     // filtered p1
        st1             {v16.16B}, [x0], x1     // filtered p0
        st1             {v0.16B},  [x0], x1     // filtered q0
        st1             {v19.16B}, [x0]         // filtered q1

        ret
endfunc
130
/*
 * ff_h264_h_loop_filter_luma_neon
 *
 * Filters a luma edge lying between two pixel columns, 16 rows high.
 *
 * In:    x0 = address of the first pixel right of the edge in the first row,
 *        w1 = row stride as a 32-bit signed int,
 *        w2 = alpha, w3 = beta, x4 = tc0 pointer.
 * Reads 8 bytes per row starting at x0 - 4 for 16 rows, transposes them so
 * that the registers match the layout h264_loop_filter_luma expects
 * (presumably v20/v18/v16 = p2/p1/p0 and v0/v2/v4 = q0/q1/q2 -- the
 * transpose macros come from neon.S), filters, transposes the four result
 * registers back and writes 4 bytes per row starting at x0 - 2.
 * May return early from h264_loop_filter_start without touching memory.
 */
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1                 // stride is an int: sign extend before address math

        sub             x0,  x0,  #4            // four pixels left of the edge
        ld1             {v6.8B},  [x0], x1      // rows 0-7 into the low halves
        ld1             {v20.8B}, [x0], x1
        ld1             {v18.8B}, [x0], x1
        ld1             {v16.8B}, [x0], x1
        ld1             {v0.8B},  [x0], x1
        ld1             {v2.8B},  [x0], x1
        ld1             {v4.8B},  [x0], x1
        ld1             {v26.8B}, [x0], x1
        ld1             {v6.D}[1],  [x0], x1    // rows 8-15 into the high halves
        ld1             {v20.D}[1], [x0], x1
        ld1             {v18.D}[1], [x0], x1
        ld1             {v16.D}[1], [x0], x1
        ld1             {v0.D}[1],  [x0], x1
        ld1             {v2.D}[1],  [x0], x1
        ld1             {v4.D}[1],  [x0], x1
        ld1             {v26.D}[1], [x0], x1

        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub             x0,  x0,  x1, lsl #4    // back to the first of the 16 rows
        add             x0,  x0,  #2            // from edge - 4 to edge - 2
        st1             {v17.S}[0],  [x0], x1
        st1             {v16.S}[0],  [x0], x1
        st1             {v0.S}[0],   [x0], x1
        st1             {v19.S}[0],  [x0], x1
        st1             {v17.S}[1],  [x0], x1
        st1             {v16.S}[1],  [x0], x1
        st1             {v0.S}[1],   [x0], x1
        st1             {v19.S}[1],  [x0], x1
        st1             {v17.S}[2],  [x0], x1
        st1             {v16.S}[2],  [x0], x1
        st1             {v0.S}[2],   [x0], x1
        st1             {v19.S}[2],  [x0], x1
        st1             {v17.S}[3],  [x0], x1
        st1             {v16.S}[3],  [x0], x1
        st1             {v0.S}[3],   [x0], x1
        st1             {v19.S}[3],  [x0], x1

        ret
endfunc
180
/*
 * h264_loop_filter_chroma: filter 8 chroma pixel positions across one edge.
 *
 * In:    v18 = p1, v16 = p0, v0 = q0, v2 = q1 (8 bytes each),
 *        w2 = alpha, w3 = beta,
 *        v24 low four bytes = tc0 values (as left by h264_loop_filter_start).
 * Out:   v16 = filtered p0, v0 = filtered q0.
 * Clobb: v4, v22, v24, v25, v26, v28, v30.
 *
 * Each tc0 byte is replicated over two consecutive lanes.  The delta
 * ((q0 - p0) * 4 + p1 - q1 + 4) >> 3 is clamped to +-tc0 and then zeroed in
 * every lane that fails abs(p0 - q0) < alpha, abs(p1 - p0) < beta or
 * abs(q1 - q0) < beta.
 */
.macro  h264_loop_filter_chroma
        dup             v22.8B,  w2                     // alpha
        uxtl            v24.8H,  v24.8B
        uabd            v26.8B,  v16.8B,  v0.8B         // abs(p0 - q0)
        uxtl            v4.8H,   v0.8B
        uabd            v28.8B,  v18.8B,  v16.8B        // abs(p1 - p0)
        usubw           v4.8H,   v4.8H,   v16.8B
        sli             v24.8H,  v24.8H,  #8            // each tc0 byte now fills 2 lanes
        shl             v4.8H,   v4.8H,   #2
        uabd            v30.8B,  v2.8B,   v0.8B         // abs(q1 - q0)
        uaddw           v4.8H,   v4.8H,   v18.8B
        cmhi            v26.8B,  v22.8B,  v26.8B        // < alpha
        usubw           v4.8H,   v4.8H,   v2.8B
        dup             v22.8B,  w3                     // beta
        rshrn           v4.8B,   v4.8H,   #3            // v4 = unclamped delta
        cmhi            v28.8B,  v22.8B,  v28.8B        // < beta
        cmhi            v30.8B,  v22.8B,  v30.8B        // < beta
        smin            v4.8B,   v4.8B,   v24.8B
        neg             v25.8B,  v24.8B
        and             v26.8B,  v26.8B,  v28.8B
        smax            v4.8B,   v4.8B,   v25.8B        // delta clamped to +-tc0
        and             v26.8B,  v26.8B,  v30.8B        // v26 = lanes to filter
        uxtl            v22.8H,  v0.8B
        and             v4.8B,   v4.8B,   v26.8B        // delta = 0 in unfiltered lanes
        uxtl            v28.8H,  v16.8B
        saddw           v28.8H,  v28.8H,  v4.8B         // p0 + delta
        ssubw           v22.8H,  v22.8H,  v4.8B         // q0 - delta
        sqxtun          v16.8B,  v28.8H
        sqxtun          v0.8B,   v22.8H
.endm
211
/*
 * ff_h264_v_loop_filter_chroma_neon
 *
 * Filters a chroma edge lying between two pixel rows, 8 pixels wide.
 *
 * In:    x0 = address of the q0 row (first row after the edge),
 *        w1 = row stride as a 32-bit signed int,
 *        w2 = alpha, w3 = beta, x4 = tc0 pointer.
 * Reads rows x0 - 2 * stride .. x0 + stride,
 * writes rows x0 - stride and x0.
 * May return early from h264_loop_filter_start without touching memory.
 */
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1                 // stride is an int: sign extend before address math

        sub             x0,  x0,  x1, lsl #1    // x0 = p1 row
        ld1             {v18.8B}, [x0], x1      // p1
        ld1             {v16.8B}, [x0], x1      // p0
        ld1             {v0.8B},  [x0], x1      // q0
        ld1             {v2.8B},  [x0]          // q1 (x0 stays on the q1 row)

        h264_loop_filter_chroma

        sub             x0,  x0,  x1, lsl #1    // x0 = p0 row
        st1             {v16.8B}, [x0], x1      // filtered p0
        st1             {v0.8B},  [x0], x1      // filtered q0

        ret
endfunc
230
/*
 * ff_h264_h_loop_filter_chroma_neon
 *
 * Filters a chroma edge lying between two pixel columns, 8 rows high.
 *
 * In:    x0 = address of the first pixel right of the edge in the first row,
 *        w1 = row stride as a 32-bit signed int,
 *        w2 = alpha, w3 = beta, x4 = tc0 pointer.
 * Reads 4 bytes per row starting at x0 - 2 for 8 rows, transposes them
 * (transpose_4x8B comes from neon.S; presumably leaving v18/v16/v0/v2 =
 * p1/p0/q0/q1 as h264_loop_filter_chroma expects), filters, transposes back
 * and rewrites the same 4 bytes of every row.
 * May return early from h264_loop_filter_start without touching memory.
 */
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1                 // stride is an int: sign extend before address math

        sub             x0,  x0,  #2            // two pixels left of the edge
        ld1             {v18.S}[0], [x0], x1    // rows 0-3
        ld1             {v16.S}[0], [x0], x1
        ld1             {v0.S}[0],  [x0], x1
        ld1             {v2.S}[0],  [x0], x1
        ld1             {v18.S}[1], [x0], x1    // rows 4-7
        ld1             {v16.S}[1], [x0], x1
        ld1             {v0.S}[1],  [x0], x1
        ld1             {v2.S}[1],  [x0], x1

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x0,  x1, lsl #3    // back to the first of the 8 rows
        st1             {v18.S}[0], [x0], x1
        st1             {v16.S}[0], [x0], x1
        st1             {v0.S}[0],  [x0], x1
        st1             {v2.S}[0],  [x0], x1
        st1             {v18.S}[1], [x0], x1
        st1             {v16.S}[1], [x0], x1
        st1             {v0.S}[1],  [x0], x1
        st1             {v2.S}[1],  [x0], x1

        ret
endfunc
263
/*
 * biweight_16 macs, macd: weighted sum of two 16-pixel-wide blocks.
 *
 * macd is the widening multiply-accumulate applied to pixels read through
 * x0 with the weight in w5; macs is the one applied to pixels read through
 * x1 with the weight in w6 (umlal adds, umlsl subtracts the product).
 *
 * In:    x0, x1 = source rows, x7 = destination rows, x2 = stride,
 *        w3 = row count (two rows per iteration; the loop ends when the
 *             count reaches exactly zero),
 *        w5, w6 = weight magnitudes (low byte is broadcast),
 *        v16 = initial accumulator value, v18 = per-lane sshl shift count.
 * Clobb: v0, v1, v4, v6, v20, v22, v24, v26, v28, v30, w3, flags.
 * Ends with ret, so control never falls out of the macro.
 */
.macro  biweight_16     macs, macd
        dup             v0.16B,  w5
        dup             v1.16B,  w6
        mov             v4.16B,  v16.16B
        mov             v6.16B,  v16.16B
1:      subs            w3,  w3,  #2
        ld1             {v20.16B}, [x0], x2
        \macd           v4.8H,   v0.8B,  v20.8B
        \macd\()2       v6.8H,   v0.16B, v20.16B
        ld1             {v22.16B}, [x1], x2
        \macs           v4.8H,   v1.8B,  v22.8B
        \macs\()2       v6.8H,   v1.16B, v22.16B
        mov             v24.16B, v16.16B
        ld1             {v28.16B}, [x0], x2
        mov             v26.16B, v16.16B
        \macd           v24.8H,  v0.8B,  v28.8B
        \macd\()2       v26.8H,  v0.16B, v28.16B
        ld1             {v30.16B}, [x1], x2
        \macs           v24.8H,  v1.8B,  v30.8B
        \macs\()2       v26.8H,  v1.16B, v30.16B
        sshl            v4.8H,   v4.8H,  v18.8H
        sshl            v6.8H,   v6.8H,  v18.8H
        sqxtun          v4.8B,   v4.8H
        sqxtun2         v4.16B,  v6.8H
        sshl            v24.8H,  v24.8H, v18.8H
        sshl            v26.8H,  v26.8H, v18.8H
        sqxtun          v24.8B,  v24.8H
        sqxtun2         v24.16B, v26.8H
        mov             v6.16B,  v16.16B        // re-arm accumulators for the next pass
        st1             {v4.16B},  [x7], x2
        mov             v4.16B,  v16.16B
        st1             {v24.16B}, [x7], x2
        b.ne            1b                      // flags still from the subs above
        ret
.endm
299
/*
 * biweight_8 macs, macd: weighted sum of two 8-pixel-wide blocks.
 *
 * macd is the widening multiply-accumulate applied to pixels read through
 * x0 with the weight in w5; macs is the one applied to pixels read through
 * x1 with the weight in w6.
 *
 * In:    x0, x1 = source rows, x7 = destination rows, x2 = stride,
 *        w3 = row count (two rows per iteration; the loop ends when the
 *             count reaches exactly zero),
 *        w5, w6 = weight magnitudes (low byte is broadcast),
 *        v16 = initial accumulator value, v18 = per-lane sshl shift count.
 * Clobb: v0, v1, v2, v4, v5, v6, v7, v20, w3, flags.
 * Ends with ret, so control never falls out of the macro.
 */
.macro  biweight_8      macs, macd
        dup             v0.8B,   w5
        dup             v1.8B,   w6
        mov             v2.16B,  v16.16B
        mov             v20.16B, v16.16B
1:      subs            w3,  w3,  #2
        ld1             {v4.8B}, [x0], x2
        \macd           v2.8H,   v0.8B,  v4.8B
        ld1             {v5.8B}, [x1], x2
        \macs           v2.8H,   v1.8B,  v5.8B
        ld1             {v6.8B}, [x0], x2
        \macd           v20.8H,  v0.8B,  v6.8B
        ld1             {v7.8B}, [x1], x2
        \macs           v20.8H,  v1.8B,  v7.8B
        sshl            v2.8H,   v2.8H,  v18.8H
        sqxtun          v2.8B,   v2.8H
        sshl            v20.8H,  v20.8H, v18.8H
        sqxtun          v4.8B,   v20.8H
        mov             v20.16B, v16.16B        // re-arm accumulators for the next pass
        st1             {v2.8B}, [x7], x2
        mov             v2.16B,  v16.16B
        st1             {v4.8B}, [x7], x2
        b.ne            1b                      // flags still from the subs above
        ret
.endm
325
/*
 * biweight_4 macs, macd: weighted sum of two 4-pixel-wide blocks.
 *
 * Two 4-byte rows are packed into one 8-byte vector.  Each iteration handles
 * four rows; if fewer than four rows remain after the first pair has been
 * loaded (subs result negative), only that pair is finished at label 2.
 *
 * macd is the widening multiply-accumulate applied to pixels read through
 * x0 with the weight in w5; macs is the one applied to pixels read through
 * x1 with the weight in w6.
 *
 * In:    x0, x1 = source rows, x7 = destination rows, x2 = stride,
 *        w3 = row count,
 *        w5, w6 = weight magnitudes (low byte is broadcast),
 *        v16 = initial accumulator value, v18 = per-lane sshl shift count.
 * Clobb: v0, v1, v2, v4, v5, v6, v7, v20, w3, flags.
 * Both exits end with ret, so control never falls out of the macro.
 */
.macro  biweight_4      macs, macd
        dup             v0.8B,   w5
        dup             v1.8B,   w6
        mov             v2.16B,  v16.16B
        mov             v20.16B, v16.16B
1:      subs            w3,  w3,  #4
        ld1             {v4.S}[0], [x0], x2
        ld1             {v4.S}[1], [x0], x2
        \macd           v2.8H,   v0.8B,  v4.8B
        ld1             {v5.S}[0], [x1], x2
        ld1             {v5.S}[1], [x1], x2
        \macs           v2.8H,   v1.8B,  v5.8B
        b.lt            2f                      // flags from subs: fewer than 4 rows were left
        ld1             {v6.S}[0], [x0], x2
        ld1             {v6.S}[1], [x0], x2
        \macd           v20.8H,  v0.8B,  v6.8B
        ld1             {v7.S}[0], [x1], x2
        ld1             {v7.S}[1], [x1], x2
        \macs           v20.8H,  v1.8B,  v7.8B
        sshl            v2.8H,   v2.8H,  v18.8H
        sqxtun          v2.8B,   v2.8H
        sshl            v20.8H,  v20.8H, v18.8H
        sqxtun          v4.8B,   v20.8H
        mov             v20.16B, v16.16B        // re-arm accumulators for the next pass
        st1             {v2.S}[0], [x7], x2
        st1             {v2.S}[1], [x7], x2
        mov             v2.16B,  v16.16B
        st1             {v4.S}[0], [x7], x2
        st1             {v4.S}[1], [x7], x2
        b.ne            1b
        ret
2:      sshl            v2.8H,   v2.8H,  v18.8H
        sqxtun          v2.8B,   v2.8H
        st1             {v2.S}[0], [x7], x2
        st1             {v2.S}[1], [x7], x2
        ret
.endm
363
/*
 * biweight_func w: defines ff_biweight_h264_pixels_<w>_neon.
 *
 * In:    x0 = first pixel block, also the destination (copied to x7),
 *        x1 = second pixel block,
 *        w2 = row stride as a 32-bit signed int,
 *        w3 = row count,
 *        w4 = shift base (presumably log2_denom -- confirm with the caller),
 *        w5 = signed weight for the x0 pixels,
 *        w6 = signed weight for the x1 pixels,
 *        w7 = offset.
 *
 * Setup: v16 = ((w7 + 1) | 1) << w4 in every halfword (accumulator start),
 *        v18 = ~w4 = -(w4 + 1) in every halfword, so the sshl inside
 *        biweight_<w> shifts right by w4 + 1.
 *
 * The multiplies are unsigned, so the weight signs are folded into a choice
 * between umlal and umlsl.  w8 = (w5 >> 31) ^ (w6 >> 30), logical shifts:
 *   0: both weights >= 0       -> label 10
 *   1: only w5 negative        -> label 20
 *   2: both negative           -> label 30
 *   other: only w6 negative    -> label 40
 * This assumes bits 30 and 31 of w6 agree, i.e. w6 is a small signed value.
 * Every biweight_<w> expansion ends in ret, so the labelled cases do not
 * fall through into each other.
 */
.macro  biweight_func   w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        sxtw            x2,  w2                 // stride is an int: sign extend before address math
        lsr             w8,  w5,  #31
        add             w7,  w7,  #1
        eor             w8,  w8,  w6,  lsr #30
        orr             w7,  w7,  #1
        dup             v18.8H,  w4
        lsl             w7,  w7,  w4
        not             v18.16B, v18.16B
        dup             v16.8H,  w7
        mov             x7,  x0
        cbz             w8,  10f
        subs            w8,  w8,  #1
        b.eq            20f
        subs            w8,  w8,  #1
        b.eq            30f
        b               40f
10:     biweight_\w     umlal, umlal
20:     neg             w5,  w5
        biweight_\w     umlal, umlsl
30:     neg             w5,  w5
        neg             w6,  w6
        biweight_\w     umlsl, umlsl
40:     neg             w6,  w6
        biweight_\w     umlsl, umlal
endfunc
.endm
392
// Emit ff_biweight_h264_pixels_{16,8,4}_neon.
        biweight_func   16
        biweight_func   8
        biweight_func   4
396
/*
 * weight_16 add: scale a 16-pixel-wide block by one weight.
 *
 * The add argument names the instruction that combines v16 with the
 * unsigned product weight * pixel; the result is always v16 <op> product.
 *
 * In:    x0 = source rows, x5 = destination rows, x1 = stride,
 *        w2 = row count (two rows per iteration; the loop ends when the
 *             count reaches exactly zero),
 *        w4 = weight magnitude (low byte is broadcast),
 *        v16 = value combined with every product,
 *        v18 = per-lane srshl shift count.
 * Clobb: v0, v4, v6, v20, v24, v26, v28, w2, flags.
 * Ends with ret, so control never falls out of the macro.
 */
.macro  weight_16       add
        dup             v0.16B,  w4
1:      subs            w2,  w2,  #2
        ld1             {v20.16B}, [x0], x1
        umull           v4.8H,   v0.8B,  v20.8B
        umull2          v6.8H,   v0.16B, v20.16B
        ld1             {v28.16B}, [x0], x1
        umull           v24.8H,  v0.8B,  v28.8B
        umull2          v26.8H,  v0.16B, v28.16B
        \add            v4.8H,   v16.8H, v4.8H
        srshl           v4.8H,   v4.8H,  v18.8H
        \add            v6.8H,   v16.8H, v6.8H
        srshl           v6.8H,   v6.8H,  v18.8H
        sqxtun          v4.8B,   v4.8H
        sqxtun2         v4.16B,  v6.8H
        \add            v24.8H,  v16.8H, v24.8H
        srshl           v24.8H,  v24.8H, v18.8H
        \add            v26.8H,  v16.8H, v26.8H
        srshl           v26.8H,  v26.8H, v18.8H
        sqxtun          v24.8B,  v24.8H
        sqxtun2         v24.16B, v26.8H
        st1             {v4.16B},  [x5], x1
        st1             {v24.16B}, [x5], x1
        b.ne            1b                      // flags still from the subs above
        ret
.endm
423
/*
 * weight_8 add: scale an 8-pixel-wide block by one weight.
 *
 * The add argument names the instruction that combines v16 with the
 * unsigned product weight * pixel; the result is always v16 <op> product.
 *
 * In:    x0 = source rows, x5 = destination rows, x1 = stride,
 *        w2 = row count (two rows per iteration; the loop ends when the
 *             count reaches exactly zero),
 *        w4 = weight magnitude (low byte is broadcast),
 *        v16 = value combined with every product,
 *        v18 = per-lane srshl shift count.
 * Clobb: v0, v2, v4, v6, v20, w2, flags.
 * Ends with ret, so control never falls out of the macro.
 */
.macro  weight_8        add
        dup             v0.8B,   w4
1:      subs            w2,  w2,  #2
        ld1             {v4.8B}, [x0], x1
        umull           v2.8H,   v0.8B,  v4.8B
        ld1             {v6.8B}, [x0], x1
        umull           v20.8H,  v0.8B,  v6.8B
        \add            v2.8H,   v16.8H, v2.8H
        srshl           v2.8H,   v2.8H,  v18.8H
        sqxtun          v2.8B,   v2.8H
        \add            v20.8H,  v16.8H, v20.8H
        srshl           v20.8H,  v20.8H, v18.8H
        sqxtun          v4.8B,   v20.8H
        st1             {v2.8B}, [x5], x1
        st1             {v4.8B}, [x5], x1
        b.ne            1b                      // flags still from the subs above
        ret
.endm
442
/*
 * weight_4 add: scale a 4-pixel-wide block by one weight.
 *
 * Two 4-byte rows are packed into one 8-byte vector.  Each iteration handles
 * four rows; if fewer than four rows remain after the first pair has been
 * loaded (subs result negative), only that pair is finished at label 2.
 *
 * The add argument names the instruction that combines v16 with the
 * unsigned product weight * pixel; the result is always v16 <op> product.
 *
 * In:    x0 = source rows, x5 = destination rows, x1 = stride,
 *        w2 = row count,
 *        w4 = weight magnitude (low byte is broadcast),
 *        v16 = value combined with every product,
 *        v18 = per-lane srshl shift count.
 * Clobb: v0, v2, v4, v6, v20, w2, flags.
 * Both exits end with ret, so control never falls out of the macro.
 */
.macro  weight_4        add
        dup             v0.8B,   w4
1:      subs            w2,  w2,  #4
        ld1             {v4.S}[0], [x0], x1
        ld1             {v4.S}[1], [x0], x1
        umull           v2.8H,   v0.8B,  v4.8B
        b.lt            2f                      // flags from subs: fewer than 4 rows were left
        ld1             {v6.S}[0], [x0], x1
        ld1             {v6.S}[1], [x0], x1
        umull           v20.8H,  v0.8B,  v6.8B
        \add            v2.8H,   v16.8H, v2.8H
        srshl           v2.8H,   v2.8H,  v18.8H
        sqxtun          v2.8B,   v2.8H
        \add            v20.8H,  v16.8H, v20.8H
        srshl           v20.8H,  v20.8H, v18.8H
        sqxtun          v4.8B,   v20.8H
        st1             {v2.S}[0], [x5], x1
        st1             {v2.S}[1], [x5], x1
        st1             {v4.S}[0], [x5], x1
        st1             {v4.S}[1], [x5], x1
        b.ne            1b
        ret
2:      \add            v2.8H,   v16.8H, v2.8H
        srshl           v2.8H,   v2.8H,  v18.8H
        sqxtun          v2.8B,   v2.8H
        st1             {v2.S}[0], [x5], x1
        st1             {v2.S}[1], [x5], x1
        ret
.endm
472
/*
 * weight_func w: defines ff_weight_h264_pixels_<w>_neon.
 *
 * In:    x0 = pixel block, weighted in place (x5 = x0 is the store pointer),
 *        w1 = row stride as a 32-bit signed int,
 *        w2 = row count,
 *        w3 = shift amount (presumably log2_denom -- confirm with the caller),
 *        w4 = signed weight,
 *        w5 = offset.
 *
 * Setup: v16 = w5 << w3 in every halfword.
 *   w3 >  1: v18 = 1 - w3 and the halving forms shadd / shsub are used; they
 *            contribute one bit of right shift and srshl supplies the
 *            remaining w3 - 1 bits with rounding.
 *   w3 <= 1: v18 = -w3 with plain add / sub.
 * A negative weight is negated and the subtracting form is used, because
 * the multiply inside weight_<w> is unsigned.
 *
 * Label 10 is defined twice on purpose: each 10f binds to the next
 * definition after it.  Every weight_<w> expansion ends in ret, so the
 * labelled cases do not fall through into each other.
 */
.macro  weight_func     w
function ff_weight_h264_pixels_\w\()_neon, export=1
        sxtw            x1,  w1                 // stride is an int: sign extend before address math
        cmp             w3,  #1
        mov             w6,  #1
        lsl             w5,  w5,  w3
        dup             v16.8H,  w5
        mov             x5,  x0
        b.le            20f                     // flags still from cmp w3, #1
        sub             w6,  w6,  w3
        dup             v18.8H,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       shadd
10:     neg             w4,  w4
        weight_\w       shsub
20:     neg             w6,  w3
        dup             v18.8H,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       add
10:     neg             w4,  w4
        weight_\w       sub
endfunc
.endm
498
// Emit ff_weight_h264_pixels_{16,8,4}_neon.
        weight_func     16
        weight_func     8
        weight_func     4