aarch64: vp9itxfm: Fix incorrect vertical alignment
libav.git: libavcodec/aarch64/vp9itxfm_neon.S
1 /*
2 * Copyright (c) 2016 Google Inc.
3 *
4 * This file is part of Libav.
5 *
6 * Libav is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * Libav is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with Libav; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "libavutil/aarch64/asm.S"
22 #include "neon.S"
23
24 const itxfm4_coeffs, align=4
25 .short 11585, 6270, 15137, 0
26 iadst4_coeffs:
27 .short 5283, 15212, 9929, 13377
28 endconst
29
30 const iadst8_coeffs, align=4
31 .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
32 idct_coeffs:
33 .short 11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606
34 .short 16305, 12665, 10394, 7723, 14449, 15679, 4756, 0
35 .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
36 .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
37 endconst
38
39 const iadst16_coeffs, align=4
40 .short 16364, 804, 15893, 3981, 14811, 7005, 13160, 9760
41 .short 11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207
42 endconst
43
44 // out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14
45 // out2 = ((in1 - in2) * v0[0] + (1 << 13)) >> 14
46 // in/out are .8h registers; this can make do with 4 temp registers, but is
47 // more efficient if 6 temp registers are available.
48 .macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
49 .if \neg > 0
50 neg \tmp4\().4h, v0.4h
51 .endif
52 add \tmp1\().8h, \in1\().8h, \in2\().8h
53 sub \tmp2\().8h, \in1\().8h, \in2\().8h
54 .if \neg > 0
55 smull \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
56 smull2 \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0]
57 .else
58 smull \tmp3\().4s, \tmp1\().4h, v0.h[0]
59 smull2 \tmp4\().4s, \tmp1\().8h, v0.h[0]
60 .endif
61 .ifb \tmp5
62 rshrn \out1\().4h, \tmp3\().4s, #14
63 rshrn2 \out1\().8h, \tmp4\().4s, #14
64 smull \tmp3\().4s, \tmp2\().4h, v0.h[0]
65 smull2 \tmp4\().4s, \tmp2\().8h, v0.h[0]
66 rshrn \out2\().4h, \tmp3\().4s, #14
67 rshrn2 \out2\().8h, \tmp4\().4s, #14
68 .else
69 smull \tmp5\().4s, \tmp2\().4h, v0.h[0]
70 smull2 \tmp6\().4s, \tmp2\().8h, v0.h[0]
71 rshrn \out1\().4h, \tmp3\().4s, #14
72 rshrn2 \out1\().8h, \tmp4\().4s, #14
73 rshrn \out2\().4h, \tmp5\().4s, #14
74 rshrn2 \out2\().8h, \tmp6\().4s, #14
75 .endif
76 .endm
77
78 // Same as dmbutterfly0 above, but treating the input in in2 as zero,
79 // writing the same output into both out1 and out2.
80 .macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
81 smull \tmp1\().4s, \in1\().4h, v0.h[0]
82 smull2 \tmp2\().4s, \in1\().8h, v0.h[0]
83 rshrn \out1\().4h, \tmp1\().4s, #14
84 rshrn2 \out1\().8h, \tmp2\().4s, #14
85 rshrn \out2\().4h, \tmp1\().4s, #14
86 rshrn2 \out2\().8h, \tmp2\().4s, #14
87 .endm
88
89 // out1,out2 = in1 * coef1 - in2 * coef2
90 // out3,out4 = in1 * coef2 + in2 * coef1
91 // out are 4 x .4s registers, in are 2 x .8h registers
92 .macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
93 smull \out1\().4s, \in1\().4h, \coef1
94 smull2 \out2\().4s, \in1\().8h, \coef1
95 smull \out3\().4s, \in1\().4h, \coef2
96 smull2 \out4\().4s, \in1\().8h, \coef2
97 smlsl \out1\().4s, \in2\().4h, \coef2
98 smlsl2 \out2\().4s, \in2\().8h, \coef2
99 smlal \out3\().4s, \in2\().4h, \coef1
100 smlal2 \out4\().4s, \in2\().8h, \coef1
101 .endm
102
103 // inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
104 // inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
105 // inout are 2 x .8h registers
106 .macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
107 dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
108 .if \neg > 0
109 neg \tmp3\().4s, \tmp3\().4s
110 neg \tmp4\().4s, \tmp4\().4s
111 .endif
112 rshrn \inout1\().4h, \tmp1\().4s, #14
113 rshrn2 \inout1\().8h, \tmp2\().4s, #14
114 rshrn \inout2\().4h, \tmp3\().4s, #14
115 rshrn2 \inout2\().8h, \tmp4\().4s, #14
116 .endm
117
118 // Same as dmbutterfly above, but treating the input in inout2 as zero
119 .macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
120 smull \tmp1\().4s, \inout1\().4h, \coef1
121 smull2 \tmp2\().4s, \inout1\().8h, \coef1
122 smull \tmp3\().4s, \inout1\().4h, \coef2
123 smull2 \tmp4\().4s, \inout1\().8h, \coef2
124 rshrn \inout1\().4h, \tmp1\().4s, #14
125 rshrn2 \inout1\().8h, \tmp2\().4s, #14
126 rshrn \inout2\().4h, \tmp3\().4s, #14
127 rshrn2 \inout2\().8h, \tmp4\().4s, #14
128 .endm
129
130 // Same as dmbutterfly above, but treating the input in inout1 as zero
131 .macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
132 smull \tmp1\().4s, \inout2\().4h, \coef2
133 smull2 \tmp2\().4s, \inout2\().8h, \coef2
134 smull \tmp3\().4s, \inout2\().4h, \coef1
135 smull2 \tmp4\().4s, \inout2\().8h, \coef1
136 neg \tmp1\().4s, \tmp1\().4s
137 neg \tmp2\().4s, \tmp2\().4s
138 rshrn \inout2\().4h, \tmp3\().4s, #14
139 rshrn2 \inout2\().8h, \tmp4\().4s, #14
140 rshrn \inout1\().4h, \tmp1\().4s, #14
141 rshrn2 \inout1\().8h, \tmp2\().4s, #14
142 .endm
143
144 .macro dsmull_h out1, out2, in, coef
145 smull \out1\().4s, \in\().4h, \coef
146 smull2 \out2\().4s, \in\().8h, \coef
147 .endm
148
149 .macro drshrn_h out, in1, in2, shift
150 rshrn \out\().4h, \in1\().4s, \shift
151 rshrn2 \out\().8h, \in2\().4s, \shift
152 .endm
153
154
155 // out1 = in1 + in2
156 // out2 = in1 - in2
157 .macro butterfly_8h out1, out2, in1, in2
158 add \out1\().8h, \in1\().8h, \in2\().8h
159 sub \out2\().8h, \in1\().8h, \in2\().8h
160 .endm
161
162 // out1 = in1 - in2
163 // out2 = in1 + in2
164 .macro butterfly_8h_r out1, out2, in1, in2
165 sub \out1\().8h, \in1\().8h, \in2\().8h
166 add \out2\().8h, \in1\().8h, \in2\().8h
167 .endm
168
169 // out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
170 // out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
171 // out are 2 x .8h registers, in are 4 x .4s registers
172 .macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
173 add \tmp1\().4s, \in1\().4s, \in3\().4s
174 add \tmp2\().4s, \in2\().4s, \in4\().4s
175 sub \tmp3\().4s, \in1\().4s, \in3\().4s
176 sub \tmp4\().4s, \in2\().4s, \in4\().4s
177 rshrn \out1\().4h, \tmp1\().4s, #14
178 rshrn2 \out1\().8h, \tmp2\().4s, #14
179 rshrn \out2\().4h, \tmp3\().4s, #14
180 rshrn2 \out2\().8h, \tmp4\().4s, #14
181 .endm
182
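// Scalar model of the 1-D inverse WHT below (a rough reference sketch;
// the primed names are ours, for readability only):
//   a  = c0 + c1;          d  = c2 - c3;
//   e  = (a - d) >> 1;
//   c2' = e - c1;          c1' = e - c3;
//   c3' = d + c2';         c0' = a - c1';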
183 .macro iwht4 c0, c1, c2, c3
184 add \c0\().4h, \c0\().4h, \c1\().4h
185 sub v17.4h, \c2\().4h, \c3\().4h
186 sub v16.4h, \c0\().4h, v17.4h
187 sshr v16.4h, v16.4h, #1
188 sub \c2\().4h, v16.4h, \c1\().4h
189 sub \c1\().4h, v16.4h, \c3\().4h
190 add \c3\().4h, v17.4h, \c2\().4h
191 sub \c0\().4h, \c0\().4h, \c1\().4h
192 .endm
193
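// Scalar model of the 4-point IDCT below (a sketch for reference only;
// round14(x) meaning (x + (1 << 13)) >> 14, coefficients from itxfm4_coeffs):
//   t0 = round14((c0 + c2) * 11585);   t1 = round14((c0 - c2) * 11585);
//   t2 = round14(c1 * 6270 - c3 * 15137);
//   t3 = round14(c1 * 15137 + c3 * 6270);
//   c0 = t0 + t3;  c1 = t1 + t2;  c2 = t1 - t2;  c3 = t0 - t3;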
194 .macro idct4 c0, c1, c2, c3
195 smull v22.4s, \c1\().4h, v0.h[2]
196 smull v20.4s, \c1\().4h, v0.h[1]
197 add v16.4h, \c0\().4h, \c2\().4h
198 sub v17.4h, \c0\().4h, \c2\().4h
199 smlal v22.4s, \c3\().4h, v0.h[1]
200 smull v18.4s, v16.4h, v0.h[0]
201 smull v19.4s, v17.4h, v0.h[0]
202 smlsl v20.4s, \c3\().4h, v0.h[2]
203 rshrn v22.4h, v22.4s, #14
204 rshrn v18.4h, v18.4s, #14
205 rshrn v19.4h, v19.4s, #14
206 rshrn v20.4h, v20.4s, #14
207 add \c0\().4h, v18.4h, v22.4h
208 sub \c3\().4h, v18.4h, v22.4h
209 add \c1\().4h, v19.4h, v20.4h
210 sub \c2\().4h, v19.4h, v20.4h
211 .endm
212
213 .macro iadst4 c0, c1, c2, c3
214 smull v16.4s, \c0\().4h, v0.h[4]
215 smlal v16.4s, \c2\().4h, v0.h[5]
216 smlal v16.4s, \c3\().4h, v0.h[6]
217 smull v17.4s, \c0\().4h, v0.h[6]
218 smlsl v17.4s, \c2\().4h, v0.h[4]
219 sub \c0\().4h, \c0\().4h, \c2\().4h
220 smlsl v17.4s, \c3\().4h, v0.h[5]
221 add \c0\().4h, \c0\().4h, \c3\().4h
222 smull v19.4s, \c1\().4h, v0.h[7]
223 smull v18.4s, \c0\().4h, v0.h[7]
224 add v20.4s, v16.4s, v19.4s
225 add v21.4s, v17.4s, v19.4s
226 rshrn \c0\().4h, v20.4s, #14
227 add v16.4s, v16.4s, v17.4s
228 rshrn \c1\().4h, v21.4s, #14
229 sub v16.4s, v16.4s, v19.4s
230 rshrn \c2\().4h, v18.4s, #14
231 rshrn \c3\().4h, v16.4s, #14
232 .endm
233
234 // The public functions in this file have the following signature:
235 // void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
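// Each function does a 2-D inverse transform of the 16-bit coefficients in
// block (a 1-D pass, a transpose, a second 1-D pass), rounds the result,
// adds it to the 8-bit destination with clipping, and clears the consumed
// coefficients. eob is the number of nonzero coefficients; the idct_idct
// versions use it to select cheaper DC-only/partial paths. Roughly, in
// C-like pseudocode (a sketch; round2() and clip_uint8() are illustrative
// helpers, and shift is 4, 5 or 6 depending on the transform size):
//   tmp = transpose(txfm1_1d(block));
//   res = txfm2_1d(tmp);
//   dst[i] = clip_uint8(dst[i] + round2(res[i], shift));
//   memset(block, 0, sizeof(*block) * n * n);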
236
237 .macro itxfm_func4x4 txfm1, txfm2
238 function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
239 .ifc \txfm1,\txfm2
240 .ifc \txfm1,idct
241 movrel x4, itxfm4_coeffs
242 ld1 {v0.4h}, [x4]
243 .endif
244 .ifc \txfm1,iadst
245 movrel x4, iadst4_coeffs
246 ld1 {v0.d}[1], [x4]
247 .endif
248 .else
249 movrel x4, itxfm4_coeffs
250 ld1 {v0.8h}, [x4]
251 .endif
252
253 movi v31.8h, #0
254 .ifc \txfm1\()_\txfm2,idct_idct
255 cmp w3, #1
256 b.ne 1f
257 // DC-only for idct/idct
258 ld1 {v2.h}[0], [x2]
259 smull v2.4s, v2.4h, v0.h[0]
260 rshrn v2.4h, v2.4s, #14
261 smull v2.4s, v2.4h, v0.h[0]
262 rshrn v2.4h, v2.4s, #14
263 st1 {v31.h}[0], [x2]
264 dup v4.4h, v2.h[0]
265 mov v5.16b, v4.16b
266 mov v6.16b, v4.16b
267 mov v7.16b, v4.16b
268 b 2f
269 .endif
270
271 1:
272 ld1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x2]
273 st1 {v31.8h}, [x2], #16
274
275 .ifc \txfm1,iwht
276 sshr v4.4h, v4.4h, #2
277 sshr v5.4h, v5.4h, #2
278 sshr v6.4h, v6.4h, #2
279 sshr v7.4h, v7.4h, #2
280 .endif
281
282 \txfm1\()4 v4, v5, v6, v7
283
284 st1 {v31.8h}, [x2], #16
285 // Transpose 4x4 with 16 bit elements
286 transpose_4x4H v4, v5, v6, v7, v16, v17, v18, v19
287
288 \txfm2\()4 v4, v5, v6, v7
289 2:
290 ld1 {v0.s}[0], [x0], x1
291 ld1 {v1.s}[0], [x0], x1
292 .ifnc \txfm1,iwht
293 srshr v4.4h, v4.4h, #4
294 srshr v5.4h, v5.4h, #4
295 srshr v6.4h, v6.4h, #4
296 srshr v7.4h, v7.4h, #4
297 .endif
298 uaddw v4.8h, v4.8h, v0.8b
299 uaddw v5.8h, v5.8h, v1.8b
300 ld1 {v2.s}[0], [x0], x1
301 ld1 {v3.s}[0], [x0], x1
302 sqxtun v0.8b, v4.8h
303 sqxtun v1.8b, v5.8h
304 sub x0, x0, x1, lsl #2
305
306 uaddw v6.8h, v6.8h, v2.8b
307 uaddw v7.8h, v7.8h, v3.8b
308 st1 {v0.s}[0], [x0], x1
309 sqxtun v2.8b, v6.8h
310 sqxtun v3.8b, v7.8h
311
312 st1 {v1.s}[0], [x0], x1
313 st1 {v2.s}[0], [x0], x1
314 st1 {v3.s}[0], [x0], x1
315
316 ret
317 endfunc
318 .endm
319
320 itxfm_func4x4 idct, idct
321 itxfm_func4x4 iadst, idct
322 itxfm_func4x4 idct, iadst
323 itxfm_func4x4 iadst, iadst
324 itxfm_func4x4 iwht, iwht
325
326
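// The idct8 and iadst8 macros below do an 8-point 1-D transform of the
// eight .8h registers v16-v23 in place, using the coefficients preloaded
// into v0 (and v1 for iadst8); v2-v7 and v24-v31 are used as scratch.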
327 .macro idct8
328 dmbutterfly0 v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
329 dmbutterfly v18, v22, v0.h[1], v0.h[2], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
330 dmbutterfly v17, v23, v0.h[3], v0.h[4], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
331 dmbutterfly v21, v19, v0.h[5], v0.h[6], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
332
333 butterfly_8h v24, v25, v16, v22 // v24 = t0, v25 = t3
334 butterfly_8h v28, v29, v17, v21 // v28 = t4, v29 = t5a
335 butterfly_8h v30, v31, v23, v19 // v30 = t7, v31 = t6a
336 butterfly_8h v26, v27, v20, v18 // v26 = t1, v27 = t2
337
338 dmbutterfly0 v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5
339
340 butterfly_8h v16, v23, v24, v30 // v16 = out[0], v23 = out[7]
341 butterfly_8h v17, v22, v26, v31 // v17 = out[1], v22 = out[6]
342 butterfly_8h v18, v21, v27, v29 // v18 = out[2], v21 = out[5]
343 butterfly_8h v19, v20, v25, v28 // v19 = out[3], v20 = out[4]
344 .endm
345
346 .macro iadst8
347 dmbutterfly_l v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0] // v24,v25 = t1a, v26,v27 = t0a
348 dmbutterfly_l v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2] // v28,v29 = t3a, v30,v31 = t2a
349 dmbutterfly_l v2, v3, v4, v5, v19, v20, v1.h[5], v1.h[4] // v2,v3 = t5a, v4,v5 = t4a
350 dmbutterfly_l v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6] // v16,v18 = t7a, v21,v23 = t6a
351
352 dbutterfly_n v4, v5, v26, v27, v4, v5, v6, v7, v26, v27 // v4 = t0, v5 = t4
353 dbutterfly_n v2, v3, v24, v25, v2, v3, v6, v7, v26, v27 // v2 = t1, v3 = t5
354 dbutterfly_n v24, v25, v30, v31, v21, v23, v6, v7, v26, v27 // v24 = t2, v25 = t6
355 dbutterfly_n v30, v31, v28, v29, v16, v18, v6, v7, v26, v27 // v30 = t3, v31 = t7
356
357 butterfly_8h v16, v6, v4, v24 // v16 = out[0], v6 = t2
358 butterfly_8h v23, v7, v2, v30 // v23 = -out[7], v7 = t3
359 neg v23.8h, v23.8h // v23 = out[7]
360
361 dmbutterfly0 v19, v20, v6, v7, v24, v26, v27, v28, v29, v30 // v19 = -out[3], v20 = out[4]
362 neg v19.8h, v19.8h // v19 = out[3]
363
364 dmbutterfly_l v26, v27, v28, v29, v5, v3, v0.h[1], v0.h[2] // v26,v27 = t5a, v28,v29 = t4a
365 dmbutterfly_l v2, v3, v4, v5, v31, v25, v0.h[2], v0.h[1] // v2,v3 = t6a, v4,v5 = t7a
366
367 dbutterfly_n v17, v30, v28, v29, v2, v3, v6, v7, v24, v25 // v17 = -out[1], v30 = t6
368 dbutterfly_n v22, v31, v26, v27, v4, v5, v6, v7, v24, v25 // v22 = out[6], v31 = t7
369 neg v17.8h, v17.8h // v17 = out[1]
370
371 dmbutterfly0 v18, v21, v30, v31, v2, v3, v4, v5, v6, v7 // v18 = out[2], v21 = -out[5]
372 neg v21.8h, v21.8h // v21 = out[5]
373 .endm
374
375
376 .macro itxfm_func8x8 txfm1, txfm2
377 function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
378 // The iadst also uses a few coefficients from
379 // idct, so those always need to be loaded.
380 .ifc \txfm1\()_\txfm2,idct_idct
381 movrel x4, idct_coeffs
382 .else
383 movrel x4, iadst8_coeffs
384 ld1 {v1.8h}, [x4], #16
385 .endif
386 ld1 {v0.8h}, [x4]
387
388 movi v2.8h, #0
389 movi v3.8h, #0
390 movi v4.8h, #0
391 movi v5.8h, #0
392
393 .ifc \txfm1\()_\txfm2,idct_idct
394 cmp w3, #1
395 b.ne 1f
396 // DC-only for idct/idct
397 ld1 {v2.h}[0], [x2]
398 smull v2.4s, v2.4h, v0.h[0]
399 rshrn v2.4h, v2.4s, #14
400 smull v2.4s, v2.4h, v0.h[0]
401 rshrn v2.4h, v2.4s, #14
402 st1 {v3.h}[0], [x2]
403 dup v16.8h, v2.h[0]
404 mov v17.16b, v16.16b
405 mov v18.16b, v16.16b
406 mov v19.16b, v16.16b
407 mov v20.16b, v16.16b
408 mov v21.16b, v16.16b
409 mov v22.16b, v16.16b
410 mov v23.16b, v16.16b
411 b 2f
412 .endif
413 1:
414 ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2], #64
415 ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2], #64
416 sub x2, x2, #128
417 st1 {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64
418 st1 {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64
419
420 \txfm1\()8
421
422 // Transpose 8x8 with 16 bit elements
423 transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
424
425 \txfm2\()8
426 2:
427 mov x3, x0
428 // Add into the destination
429 ld1 {v0.8b}, [x0], x1
430 srshr v16.8h, v16.8h, #5
431 ld1 {v1.8b}, [x0], x1
432 srshr v17.8h, v17.8h, #5
433 ld1 {v2.8b}, [x0], x1
434 srshr v18.8h, v18.8h, #5
435 uaddw v16.8h, v16.8h, v0.8b
436 ld1 {v3.8b}, [x0], x1
437 srshr v19.8h, v19.8h, #5
438 uaddw v17.8h, v17.8h, v1.8b
439 ld1 {v4.8b}, [x0], x1
440 srshr v20.8h, v20.8h, #5
441 uaddw v18.8h, v18.8h, v2.8b
442 sqxtun v0.8b, v16.8h
443 ld1 {v5.8b}, [x0], x1
444 srshr v21.8h, v21.8h, #5
445 uaddw v19.8h, v19.8h, v3.8b
446 sqxtun v1.8b, v17.8h
447 ld1 {v6.8b}, [x0], x1
448 srshr v22.8h, v22.8h, #5
449 uaddw v20.8h, v20.8h, v4.8b
450 sqxtun v2.8b, v18.8h
451 ld1 {v7.8b}, [x0], x1
452 srshr v23.8h, v23.8h, #5
453 uaddw v21.8h, v21.8h, v5.8b
454 sqxtun v3.8b, v19.8h
455
456 st1 {v0.8b}, [x3], x1
457 uaddw v22.8h, v22.8h, v6.8b
458 st1 {v1.8b}, [x3], x1
459 sqxtun v4.8b, v20.8h
460 st1 {v2.8b}, [x3], x1
461 uaddw v23.8h, v23.8h, v7.8b
462 st1 {v3.8b}, [x3], x1
463 sqxtun v5.8b, v21.8h
464 st1 {v4.8b}, [x3], x1
465 sqxtun v6.8b, v22.8h
466 st1 {v5.8b}, [x3], x1
467 sqxtun v7.8b, v23.8h
468
469 st1 {v6.8b}, [x3], x1
470 st1 {v7.8b}, [x3], x1
471
472 ret
473 endfunc
474 .endm
475
476 itxfm_func8x8 idct, idct
477 itxfm_func8x8 iadst, idct
478 itxfm_func8x8 idct, iadst
479 itxfm_func8x8 iadst, iadst
480
481
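// DC-only special case: with eob == 1 only block[0] is nonzero, so both
// 1-D idct passes reduce to a multiplication by v0.h[0] (11585). Roughly,
// as a sketch (round2(x, n) meaning (x + (1 << (n - 1))) >> n):
//   dc     = round2(round2(block[0] * 11585, 14) * 11585, 14);
//   dst[i] = clip_uint8(dst[i] + round2(dc, 6));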
482 function idct16x16_dc_add_neon
483 movrel x4, idct_coeffs
484 ld1 {v0.4h}, [x4]
485
486 movi v1.4h, #0
487
488 ld1 {v2.h}[0], [x2]
489 smull v2.4s, v2.4h, v0.h[0]
490 rshrn v2.4h, v2.4s, #14
491 smull v2.4s, v2.4h, v0.h[0]
492 rshrn v2.4h, v2.4s, #14
493 dup v2.8h, v2.h[0]
494 st1 {v1.h}[0], [x2]
495
496 srshr v2.8h, v2.8h, #6
497
498 mov x4, #16
499 1:
500 // Loop to add the constant from v2 into all 16x16 outputs
501 ld1 {v3.16b}, [x0]
502 uaddw v4.8h, v2.8h, v3.8b
503 uaddw2 v5.8h, v2.8h, v3.16b
504 sqxtun v4.8b, v4.8h
505 sqxtun2 v4.16b, v5.8h
506 st1 {v4.16b}, [x0], x1
507 subs x4, x4, #1
508 b.ne 1b
509
510 ret
511 endfunc
512
513 .macro idct16_end
514 butterfly_8h v18, v7, v4, v7 // v18 = t0a, v7 = t7a
515 butterfly_8h v19, v22, v5, v22 // v19 = t1a, v22 = t6
516 butterfly_8h v4, v26, v20, v26 // v4 = t2a, v26 = t5
517 butterfly_8h v5, v6, v28, v6 // v5 = t3a, v6 = t4
518 butterfly_8h v20, v28, v16, v24 // v20 = t8a, v28 = t11a
519 butterfly_8h v24, v21, v23, v21 // v24 = t9, v21 = t10
520 butterfly_8h v23, v27, v25, v27 // v23 = t14, v27 = t13
521 butterfly_8h v25, v29, v29, v17 // v25 = t15a, v29 = t12a
522
523 dmbutterfly0 v2, v3, v27, v21, v2, v3, v16, v17, v30, v31 // v2 = t13a, v3 = t10a
524 dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11
525
526 butterfly_8h v16, v31, v18, v25 // v16 = out[0], v31 = out[15]
527 butterfly_8h v17, v30, v19, v23 // v17 = out[1], v30 = out[14]
528 butterfly_8h_r v25, v22, v22, v24 // v25 = out[9], v22 = out[6]
529 butterfly_8h v23, v24, v7, v20 // v23 = out[7], v24 = out[8]
530 butterfly_8h v18, v29, v4, v2 // v18 = out[2], v29 = out[13]
531 butterfly_8h v19, v28, v5, v28 // v19 = out[3], v28 = out[12]
532 butterfly_8h v20, v27, v6, v27 // v20 = out[4], v27 = out[11]
533 butterfly_8h v21, v26, v26, v3 // v21 = out[5], v26 = out[10]
534 ret
535 .endm
536
537 function idct16
538 dmbutterfly0 v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
539 dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
540 dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
541 dmbutterfly v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
542 dmbutterfly v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
543 dmbutterfly v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
544 dmbutterfly v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
545 dmbutterfly v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
546
547 butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3
548 butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2
549 butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5
550 butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6
551 butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9
552 butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10
553 butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13
554 butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14
555
556 dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
557 dmbutterfly v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
558 dmbutterfly v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
559 idct16_end
560 endfunc
561
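// Same as idct16 above, but assuming that input coefficients 8-15 (which
// would be held in v24-v31) are zero; the _h1/_h2 butterfly variants never
// read those registers and skip the corresponding multiplications.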
562 function idct16_half
563 dmbutterfly0_h v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
564 dmbutterfly_h1 v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
565 dmbutterfly_h1 v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
566 dmbutterfly_h2 v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
567 dmbutterfly_h1 v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
568 dmbutterfly_h2 v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
569 dmbutterfly_h1 v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
570 dmbutterfly_h2 v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
571
572 butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3
573 butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2
574 butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5
575 butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6
576 butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9
577 butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10
578 butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13
579 butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14
580
581 dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
582 dmbutterfly v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
583 dmbutterfly v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
584 idct16_end
585 endfunc
586
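// Same idea, but assuming only input coefficients 0-3 (loaded into
// v16-v19) are nonzero; only the products involving those four inputs
// are computed explicitly.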
587 function idct16_quarter
588 dsmull_h v24, v25, v19, v1.h[6]
589 dsmull_h v4, v5, v17, v0.h[7]
590 dsmull_h v7, v6, v18, v0.h[4]
591 dsmull_h v30, v31, v18, v0.h[3]
592 neg v24.4s, v24.4s
593 neg v25.4s, v25.4s
594 dsmull_h v29, v28, v17, v1.h[0]
595 dsmull_h v26, v27, v19, v1.h[5]
596 dsmull_h v22, v23, v16, v0.h[0]
597 drshrn_h v24, v24, v25, #14
598 drshrn_h v16, v4, v5, #14
599 drshrn_h v7, v7, v6, #14
600 drshrn_h v6, v30, v31, #14
601 drshrn_h v29, v29, v28, #14
602 drshrn_h v17, v26, v27, #14
603 drshrn_h v28, v22, v23, #14
604
605 dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.h[1], v0.h[2]
606 dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.h[1], v0.h[2]
607 neg v22.4s, v22.4s
608 neg v23.4s, v23.4s
609 drshrn_h v27, v20, v21, #14
610 drshrn_h v21, v22, v23, #14
611 drshrn_h v23, v18, v19, #14
612 drshrn_h v25, v30, v31, #14
613 mov v4.16b, v28.16b
614 mov v5.16b, v28.16b
615 dmbutterfly0 v22, v26, v7, v6, v18, v19, v30, v31
616 mov v20.16b, v28.16b
617 idct16_end
618 endfunc
619
620 function iadst16
621 ld1 {v0.8h,v1.8h}, [x11]
622
623 dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.h[1], v0.h[0] // v6,v7 = t1, v4,v5 = t0
624 dmbutterfly_l v10, v11, v8, v9, v23, v24, v1.h[1], v1.h[0] // v10,v11 = t9, v8,v9 = t8
625 dbutterfly_n v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a
626 dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2] // v14,v15 = t3, v12,v13 = t2
627 dbutterfly_n v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a
628
629 dmbutterfly_l v6, v7, v4, v5, v21, v26, v1.h[3], v1.h[2] // v6,v7 = t11, v4,v5 = t10
630 dbutterfly_n v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a
631 dmbutterfly_l v10, v11, v8, v9, v27, v20, v0.h[5], v0.h[4] // v10,v11 = t5, v8,v9 = t4
632 dbutterfly_n v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a
633
634 dmbutterfly_l v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4] // v14,v15 = t13, v12,v13 = t12
635 dbutterfly_n v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a
636 dmbutterfly_l v6, v7, v4, v5, v25, v22, v0.h[7], v0.h[6] // v6,v7 = t7, v4,v5 = t6
637 dbutterfly_n v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a
638
639 dmbutterfly_l v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6] // v10,v11 = t15, v8,v9 = t14
640 ld1 {v0.8h}, [x10]
641 dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a
642 dmbutterfly_l v14, v15, v12, v13, v23, v24, v0.h[3], v0.h[4] // v14,v15 = t9, v12,v13 = t8
643 dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a
644
645 dmbutterfly_l v4, v5, v6, v7, v28, v19, v0.h[4], v0.h[3] // v4,v5 = t12, v6,v7 = t13
646 dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a
647 dmbutterfly_l v10, v11, v8, v9, v21, v26, v0.h[5], v0.h[6] // v10,v11 = t11, v8,v9 = t10
648 butterfly_8h_r v4, v27, v16, v27 // v4 = t4, v27 = t0
649 dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a
650
651 dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[6], v0.h[5] // v12,v13 = t14, v14,v15 = t15
652 butterfly_8h_r v5, v20, v31, v20 // v5 = t5, v20 = t1
653 dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a
654 dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a
655
656 butterfly_8h_r v6, v25, v18, v25 // v6 = t6, v25 = t2
657 butterfly_8h_r v7, v22, v29, v22 // v7 = t7, v22 = t3
658
659 dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.h[1], v0.h[2] // v10,v11 = t13, v8,v9 = t12
660 dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[2], v0.h[1] // v12,v13 = t14, v14,v15 = t15
661
662 dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a
663 dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a
664 neg v29.8h, v29.8h // v29 = out[13]
665
666 dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.h[1], v0.h[2] // v10,v11 = t5a, v8,v9 = t4a
667 dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.h[2], v0.h[1] // v12,v13 = t6a, v14,v15 = t7a
668
669 butterfly_8h v2, v6, v27, v25 // v2 = out[0], v6 = t2a
670 butterfly_8h v3, v7, v23, v21 // v3 =-out[1], v7 = t10
671
672 dbutterfly_n v19, v31, v8, v9, v12, v13, v4, v5, v8, v9 // v19 = -out[3], v31 = t6
673 neg v19.8h, v19.8h // v19 = out[3]
674 dbutterfly_n v28, v16, v10, v11, v14, v15, v4, v5, v10, v11 // v28 = out[12], v16 = t7
675
676 butterfly_8h v5, v8, v20, v22 // v5 =-out[15],v8 = t3a
677 butterfly_8h v4, v9, v24, v26 // v4 = out[14],v9 = t11
678
679 dmbutterfly0 v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
680 dmbutterfly0 v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
681 dmbutterfly0 v20, v27, v16, v31, v10, v11, v12, v13, v14, v15 // v20 = out[4], v27 = out[11]
682 dmbutterfly0 v22, v25, v9, v7, v10, v11, v12, v13, v14, v15 // v22 = out[6], v25 = out[9]
683
684 neg v31.8h, v5.8h // v31 = out[15]
685 neg v17.8h, v3.8h // v17 = out[1]
686
687 mov v16.16b, v2.16b
688 mov v30.16b, v4.16b
689 ret
690 endfunc
691
692 // Helper macros; we can't use these expressions directly within
693 // e.g. .irp due to the extra concatenation \(). Therefore wrap
694 // them in macros to allow using .irp below.
695 .macro load i, src, inc
696 ld1 {v\i\().8h}, [\src], \inc
697 .endm
698 .macro store i, dst, inc
699 st1 {v\i\().8h}, [\dst], \inc
700 .endm
701 .macro movi_v i, size, imm
702 movi v\i\()\size, \imm
703 .endm
704 .macro load_clear i, src, inc
705 ld1 {v\i\().8h}, [\src]
706 st1 {v2.8h}, [\src], \inc
707 .endm
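// For example (a hypothetical expansion, matching how these are used below):
//   .irp i, 16, 17, 18, 19
//       load \i, x2, x9    // expands to: ld1 {v16.8h}, [x2], x9   etc.
//   .endr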
708
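// Add eight rows of 16-bit residual (coef0-coef7, rounded by 6 bits) to the
// 8-bit destination rows alternating between x0 and x3, with clipping; the
// loads, stores and arithmetic are interleaved for scheduling. tmp1/tmp2
// are .8b scratch registers.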
709 .macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
710 srshr \coef0, \coef0, #6
711 ld1 {v2.8b}, [x0], x1
712 srshr \coef1, \coef1, #6
713 ld1 {v3.8b}, [x3], x1
714 srshr \coef2, \coef2, #6
715 ld1 {v4.8b}, [x0], x1
716 srshr \coef3, \coef3, #6
717 uaddw \coef0, \coef0, v2.8b
718 ld1 {v5.8b}, [x3], x1
719 uaddw \coef1, \coef1, v3.8b
720 srshr \coef4, \coef4, #6
721 ld1 {v6.8b}, [x0], x1
722 srshr \coef5, \coef5, #6
723 ld1 {v7.8b}, [x3], x1
724 sqxtun v2.8b, \coef0
725 srshr \coef6, \coef6, #6
726 sqxtun v3.8b, \coef1
727 srshr \coef7, \coef7, #6
728 uaddw \coef2, \coef2, v4.8b
729 ld1 {\tmp1}, [x0], x1
730 uaddw \coef3, \coef3, v5.8b
731 ld1 {\tmp2}, [x3], x1
732 sqxtun v4.8b, \coef2
733 sub x0, x0, x1, lsl #2
734 sub x3, x3, x1, lsl #2
735 sqxtun v5.8b, \coef3
736 uaddw \coef4, \coef4, v6.8b
737 st1 {v2.8b}, [x0], x1
738 uaddw \coef5, \coef5, v7.8b
739 st1 {v3.8b}, [x3], x1
740 sqxtun v6.8b, \coef4
741 st1 {v4.8b}, [x0], x1
742 sqxtun v7.8b, \coef5
743 st1 {v5.8b}, [x3], x1
744 uaddw \coef6, \coef6, \tmp1
745 st1 {v6.8b}, [x0], x1
746 uaddw \coef7, \coef7, \tmp2
747 st1 {v7.8b}, [x3], x1
748 sqxtun \tmp1, \coef6
749 sqxtun \tmp2, \coef7
750 st1 {\tmp1}, [x0], x1
751 st1 {\tmp2}, [x3], x1
752 .endm
753
754 // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
755 // transpose into a horizontal 16x8 slice and store.
756 // x0 = dst (temp buffer)
757 // x1 = slice offset
758 // x2 = src
759 // x9 = input stride
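// The 16x16 functions below run this pass twice, once per 8x16 half of the
// input coefficients, writing the transposed results to a temp buffer, and
// then run the matching pass 2 function twice on that buffer, adding the
// final result into the destination.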
760 .macro itxfm16_1d_funcs txfm
761 function \txfm\()16_1d_8x16_pass1_neon
762 mov x14, x30
763
764 movi v2.8h, #0
765 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
766 load_clear \i, x2, x9
767 .endr
768
769 bl \txfm\()16
770
771 // Do two 8x8 transposes. Originally, v16-v31 contain the
772 // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
773 // transposed 8x8 blocks.
774 transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
775 transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
776
777 // Store the transposed 8x8 blocks horizontally.
778 cmp x1, #8
779 b.eq 1f
780 .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
781 store \i, x0, #16
782 .endr
783 br x14
784 1:
785 // Special case: For the last input column (x1 == 8),
786 // which would be stored as the last row in the temp buffer,
787 // don't store the first 8x8 block, but keep it in registers
788 // for the first slice of the second pass (where it is the
789 // last 8x8 block).
790 .irp i, 24, 25, 26, 27, 28, 29, 30, 31
791 add x0, x0, #16
792 store \i, x0, #16
793 .endr
794 mov v24.16b, v16.16b
795 mov v25.16b, v17.16b
796 mov v26.16b, v18.16b
797 mov v27.16b, v19.16b
798 mov v28.16b, v20.16b
799 mov v29.16b, v21.16b
800 mov v30.16b, v22.16b
801 mov v31.16b, v23.16b
802 br x14
803 endfunc
804
805 // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
806 // load the destination pixels (from a similar 8x16 slice), add and store back.
807 // x0 = dst
808 // x1 = dst stride
809 // x2 = src (temp buffer)
810 // x3 = slice offset
811 // x9 = temp buffer stride
812 function \txfm\()16_1d_8x16_pass2_neon
813 mov x14, x30
814 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
815 load \i, x2, x9
816 .endr
817 cbz x3, 1f
818 .irp i, 24, 25, 26, 27, 28, 29, 30, 31
819 load \i, x2, x9
820 .endr
821 1:
822
823 add x3, x0, x1
824 lsl x1, x1, #1
825 bl \txfm\()16
826
827 load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
828 load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
829
830 br x14
831 endfunc
832 .endm
833
834 itxfm16_1d_funcs idct
835 itxfm16_1d_funcs iadst
836
837 .macro itxfm_func16x16 txfm1, txfm2
838 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
839 .ifc \txfm1\()_\txfm2,idct_idct
840 cmp w3, #1
841 b.eq idct16x16_dc_add_neon
842 .endif
843 mov x15, x30
844 // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
845 .ifnc \txfm1\()_\txfm2,idct_idct
846 stp d14, d15, [sp, #-0x10]!
847 stp d12, d13, [sp, #-0x10]!
848 stp d10, d11, [sp, #-0x10]!
849 stp d8, d9, [sp, #-0x10]!
850 .endif
851
852 sub sp, sp, #512
853
854 mov x4, x0
855 mov x5, x1
856 mov x6, x2
857
858 movrel x10, idct_coeffs
859 .ifnc \txfm1\()_\txfm2,idct_idct
860 movrel x11, iadst16_coeffs
861 .endif
862 .ifc \txfm1,idct
863 ld1 {v0.8h,v1.8h}, [x10]
864 .endif
865 mov x9, #32
866
867 .ifc \txfm1\()_\txfm2,idct_idct
868 cmp w3, #10
869 b.le idct16x16_quarter_add_neon
870 cmp w3, #38
871 b.le idct16x16_half_add_neon
872 .endif
873
874 .irp i, 0, 8
875 add x0, sp, #(\i*32)
876 .ifc \txfm1\()_\txfm2,idct_idct
877 .if \i == 8
878 cmp w3, #38
879 b.le 1f
880 .endif
881 .endif
882 mov x1, #\i
883 add x2, x6, #(\i*2)
884 bl \txfm1\()16_1d_8x16_pass1_neon
885 .endr
886 .ifc \txfm1\()_\txfm2,iadst_idct
887 ld1 {v0.8h,v1.8h}, [x10]
888 .endif
889
890 .ifc \txfm1\()_\txfm2,idct_idct
891 b 3f
892 1:
893 // Set v24-v31 to zero, for the in-register passthrough of
894 // coefficients to pass 2. Since we only do two slices, this can
895 // only ever happen for the second slice. So we only need to store
896 // zeros to the temp buffer for the second half of the buffer.
897 // Move x0 to the second half, and use x9 == 32 as increment.
898 add x0, x0, #16
899 .irp i, 24, 25, 26, 27, 28, 29, 30, 31
900 movi_v \i, .16b, #0
901 st1 {v24.8h}, [x0], x9
902 .endr
903 3:
904 .endif
905
906 .irp i, 0, 8
907 add x0, x4, #(\i)
908 mov x1, x5
909 add x2, sp, #(\i*2)
910 mov x3, #\i
911 bl \txfm2\()16_1d_8x16_pass2_neon
912 .endr
913
914 add sp, sp, #512
915 .ifnc \txfm1\()_\txfm2,idct_idct
916 ldp d8, d9, [sp], 0x10
917 ldp d10, d11, [sp], 0x10
918 ldp d12, d13, [sp], 0x10
919 ldp d14, d15, [sp], 0x10
920 .endif
921 br x15
922 endfunc
923 .endm
924
925 itxfm_func16x16 idct, idct
926 itxfm_func16x16 iadst, idct
927 itxfm_func16x16 idct, iadst
928 itxfm_func16x16 iadst, iadst
929
930 function idct16_1d_8x16_pass1_quarter_neon
931 mov x14, x30
932 movi v2.8h, #0
933 .irp i, 16, 17, 18, 19
934 load_clear \i, x2, x9
935 .endr
936
937 bl idct16_quarter
938
939 // Do two 8x8 transposes. Originally, v16-v31 contain the
940 // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
941 // transposed 8x8 blocks.
942 transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
943 transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
944
945 // Store the transposed 8x8 blocks horizontally.
946 // The first 8x8 block is kept in registers for the second pass;
947 // the rest is stored in the temp buffer.
948 // Since only a 4x4 part of the input was nonzero, this means that
949 // only 4 rows are nonzero after transposing, and the second pass
950 // only reads the topmost 4 rows. Therefore only store the topmost
951 // 4 rows.
952 add x0, x0, #16
953 .irp i, 24, 25, 26, 27
954 store \i, x0, x9
955 .endr
956 br x14
957 endfunc
958
959 function idct16_1d_8x16_pass2_quarter_neon
960 mov x14, x30
961 cbz x3, 1f
962 .irp i, 16, 17, 18, 19
963 load \i, x2, x9
964 .endr
965 1:
966
967 add x3, x0, x1
968 lsl x1, x1, #1
969 bl idct16_quarter
970
971 load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
972 load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
973
974 br x14
975 endfunc
976
977 function idct16_1d_8x16_pass1_half_neon
978 mov x14, x30
979 movi v2.8h, #0
980 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
981 load_clear \i, x2, x9
982 .endr
983
984 bl idct16_half
985
986 // Do two 8x8 transposes. Originally, v16-v31 contain the
987 // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
988 // transposed 8x8 blocks.
989 transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
990 transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
991
992 // Store the transposed 8x8 blocks horizontally.
993 // The first 8x8 block is kept in registers for the second pass;
994 // the rest is stored in the temp buffer.
995 add x0, x0, #16
996 .irp i, 24, 25, 26, 27, 28, 29, 30, 31
997 store \i, x0, x9
998 .endr
999 br x14
1000 endfunc
1001
1002 function idct16_1d_8x16_pass2_half_neon
1003 mov x14, x30
1004 cbz x3, 1f
1005 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1006 load \i, x2, x9
1007 .endr
1008 1:
1009
1010 add x3, x0, x1
1011 lsl x1, x1, #1
1012 bl idct16_half
1013
1014 load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
1015 load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
1016
1017 br x14
1018 endfunc
1019
1020 .macro idct16_partial size
1021 function idct16x16_\size\()_add_neon
1022 add x0, sp, #(0*32)
1023 add x2, x6, #(0*2)
1024 bl idct16_1d_8x16_pass1_\size\()_neon
1025 .irp i, 0, 8
1026 add x0, x4, #(\i)
1027 mov x1, x5
1028 add x2, sp, #(\i*2)
1029 mov x3, #\i
1030 bl idct16_1d_8x16_pass2_\size\()_neon
1031 .endr
1032
1033 add sp, sp, #512
1034 br x15
1035 endfunc
1036 .endm
1037
1038 idct16_partial quarter
1039 idct16_partial half
1040
1041 function idct32x32_dc_add_neon
1042 movrel x4, idct_coeffs
1043 ld1 {v0.4h}, [x4]
1044
1045 movi v1.4h, #0
1046
1047 ld1 {v2.h}[0], [x2]
1048 smull v2.4s, v2.4h, v0.h[0]
1049 rshrn v2.4h, v2.4s, #14
1050 smull v2.4s, v2.4h, v0.h[0]
1051 rshrn v2.4h, v2.4s, #14
1052 dup v2.8h, v2.h[0]
1053 st1 {v1.h}[0], [x2]
1054
1055 srshr v0.8h, v2.8h, #6
1056
1057 mov x4, #32
1058 1:
1059 // Loop to add the constant v0 into all 32x32 outputs
1060 ld1 {v1.16b,v2.16b}, [x0]
1061 uaddw v3.8h, v0.8h, v1.8b
1062 uaddw2 v4.8h, v0.8h, v1.16b
1063 uaddw v5.8h, v0.8h, v2.8b
1064 uaddw2 v6.8h, v0.8h, v2.16b
1065 sqxtun v3.8b, v3.8h
1066 sqxtun2 v3.16b, v4.8h
1067 sqxtun v4.8b, v5.8h
1068 sqxtun2 v4.16b, v6.8h
1069 st1 {v3.16b,v4.16b}, [x0], x1
1070 subs x4, x4, #1
1071 b.ne 1b
1072
1073 ret
1074 endfunc
1075
1076 .macro idct32_end
1077 butterfly_8h v16, v5, v4, v5 // v16 = t16a, v5 = t19a
1078 butterfly_8h v17, v20, v23, v20 // v17 = t17, v20 = t18
1079 butterfly_8h v18, v6, v7, v6 // v18 = t23a, v6 = t20a
1080 butterfly_8h v19, v21, v22, v21 // v19 = t22, v21 = t21
1081 butterfly_8h v4, v28, v28, v30 // v4 = t24a, v28 = t27a
1082 butterfly_8h v23, v26, v25, v26 // v23 = t25, v26 = t26
1083 butterfly_8h v7, v3, v29, v31 // v7 = t31a, v3 = t28a
1084 butterfly_8h v22, v27, v24, v27 // v22 = t30, v27 = t29
1085
1086 dmbutterfly v27, v20, v0.h[1], v0.h[2], v24, v25, v30, v31 // v27 = t18a, v20 = t29a
1087 dmbutterfly v3, v5, v0.h[1], v0.h[2], v24, v25, v30, v31 // v3 = t19, v5 = t28
1088 dmbutterfly v28, v6, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20
1089 dmbutterfly v26, v21, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
1090
1091 butterfly_8h v31, v24, v7, v4 // v31 = t31, v24 = t24
1092 butterfly_8h v30, v25, v22, v23 // v30 = t30a, v25 = t25a
1093 butterfly_8h_r v23, v16, v16, v18 // v23 = t23, v16 = t16
1094 butterfly_8h_r v22, v17, v17, v19 // v22 = t22a, v17 = t17a
1095 butterfly_8h v18, v21, v27, v21 // v18 = t18, v21 = t21
1096 butterfly_8h_r v27, v28, v5, v28 // v27 = t27a, v28 = t28a
1097 butterfly_8h v29, v26, v20, v26 // v29 = t29, v26 = t26
1098 butterfly_8h v19, v20, v3, v6 // v19 = t19a, v20 = t20
1099
1100 dmbutterfly0 v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27, v20 = t20
1101 dmbutterfly0 v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
1102 dmbutterfly0 v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25, v22 = t22
1103 dmbutterfly0 v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
1104 ret
1105 .endm
1106
1107 function idct32_odd
1108 ld1 {v0.8h,v1.8h}, [x11]
1109
1110 dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1111 dmbutterfly v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1112 dmbutterfly v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1113 dmbutterfly v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1114 dmbutterfly v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1115 dmbutterfly v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1116 dmbutterfly v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1117 dmbutterfly v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
1118
1119 ld1 {v0.8h}, [x10]
1120
1121 butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17
1122 butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18
1123 butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21
1124 butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22
1125 butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25
1126 butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26
1127 butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30
1128 butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29
1129
1130 dmbutterfly v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
1131 dmbutterfly v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1132 dmbutterfly v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
1133 dmbutterfly v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
1134 idct32_end
1135 endfunc
1136
1137 function idct32_odd_half
1138 ld1 {v0.8h,v1.8h}, [x11]
1139
1140 dmbutterfly_h1 v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1141 dmbutterfly_h2 v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1142 dmbutterfly_h1 v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1143 dmbutterfly_h2 v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1144 dmbutterfly_h1 v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1145 dmbutterfly_h2 v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1146 dmbutterfly_h1 v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1147 dmbutterfly_h2 v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
1148
1149 ld1 {v0.8h}, [x10]
1150
1151 butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17
1152 butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18
1153 butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21
1154 butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22
1155 butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25
1156 butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26
1157 butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30
1158 butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29
1159
1160 dmbutterfly v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
1161 dmbutterfly v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1162 dmbutterfly v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
1163 dmbutterfly v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
1164 idct32_end
1165 endfunc
1166
1167 function idct32_odd_quarter
1168 ld1 {v0.8h,v1.8h}, [x11]
1169
1170 dsmull_h v4, v5, v16, v0.h[0]
1171 dsmull_h v28, v29, v19, v0.h[7]
1172 dsmull_h v30, v31, v16, v0.h[1]
1173 dsmull_h v22, v23, v17, v1.h[6]
1174 dsmull_h v7, v6, v17, v1.h[7]
1175 dsmull_h v26, v27, v19, v0.h[6]
1176 dsmull_h v20, v21, v18, v1.h[0]
1177 dsmull_h v24, v25, v18, v1.h[1]
1178
1179 ld1 {v0.8h}, [x10]
1180
1181 neg v28.4s, v28.4s
1182 neg v29.4s, v29.4s
1183 neg v7.4s, v7.4s
1184 neg v6.4s, v6.4s
1185
1186 drshrn_h v4, v4, v5, #14
1187 drshrn_h v5, v28, v29, #14
1188 drshrn_h v29, v30, v31, #14
1189 drshrn_h v28, v22, v23, #14
1190 drshrn_h v7, v7, v6, #14
1191 drshrn_h v31, v26, v27, #14
1192 drshrn_h v6, v20, v21, #14
1193 drshrn_h v30, v24, v25, #14
1194
1195 dmbutterfly_l v16, v17, v18, v19, v29, v4, v0.h[3], v0.h[4]
1196 dmbutterfly_l v27, v26, v20, v21, v31, v5, v0.h[3], v0.h[4]
1197 drshrn_h v23, v16, v17, #14
1198 drshrn_h v24, v18, v19, #14
1199 neg v20.4s, v20.4s
1200 neg v21.4s, v21.4s
1201 drshrn_h v27, v27, v26, #14
1202 drshrn_h v20, v20, v21, #14
1203 dmbutterfly_l v16, v17, v18, v19, v30, v6, v0.h[5], v0.h[6]
1204 drshrn_h v21, v16, v17, #14
1205 drshrn_h v26, v18, v19, #14
1206 dmbutterfly_l v16, v17, v18, v19, v28, v7, v0.h[5], v0.h[6]
1207 drshrn_h v25, v16, v17, #14
1208 neg v18.4s, v18.4s
1209 neg v19.4s, v19.4s
1210 drshrn_h v22, v18, v19, #14
1211
1212 idct32_end
1213 endfunc
1214
1215 .macro idct32_funcs suffix
1216 // Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix.
1217 // The 32-point IDCT can be decomposed into two 16-point IDCTs;
1218 // a normal IDCT16 with every other input component (the even ones, with
1219 // each output written twice), followed by a separate 16-point IDCT
1220 // of the odd inputs, added/subtracted onto the outputs of the first idct16.
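// In rough C-like pseudocode (a sketch of the decomposition only, ignoring
// the intermediate rounding and the actual register/temp buffer layout):
//   even = idct16(in[0], in[2], ..., in[30]);
//   odd  = idct32_odd(in[1], in[3], ..., in[31]);
//   for (i = 0; i < 16; i++) {
//       out[i]      = even[i] + odd[i];
//       out[31 - i] = even[i] - odd[i];
//   }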
1221 // x0 = dst (temp buffer)
1222 // x1 = unused
1223 // x2 = src
1224 // x9 = double input stride
1225 // x10 = idct_coeffs
1226 // x11 = idct_coeffs + 32
1227 function idct32_1d_8x32_pass1\suffix\()_neon
1228 mov x14, x30
1229 ld1 {v0.8h,v1.8h}, [x10]
1230
1231 movi v2.8h, #0
1232
1233 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1234 .ifb \suffix
1235 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1236 load_clear \i, x2, x9
1237 .endr
1238 .endif
1239 .ifc \suffix,_quarter
1240 .irp i, 16, 17, 18, 19
1241 load_clear \i, x2, x9
1242 .endr
1243 .endif
1244 .ifc \suffix,_half
1245 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1246 load_clear \i, x2, x9
1247 .endr
1248 .endif
1249
1250 bl idct16\suffix
1251
1252 // Do two 8x8 transposes. Originally, v16-v31 contain the
1253 // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
1254 // two transposed 8x8 blocks.
1255 transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
1256 transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
1257
1258 // Store the registers a, b horizontally, followed by the
1259 // same registers b, a mirrored.
1260 .macro store_rev a, b
1261 // There's no rev128 instruction, but we reverse each 64-bit
1262 // half, and then flip them using an ext with an 8-byte offset.
1263 rev64 v1.8h, \b
1264 st1 {\a}, [x0], #16
1265 rev64 v0.8h, \a
1266 ext v1.16b, v1.16b, v1.16b, #8
1267 st1 {\b}, [x0], #16
1268 ext v0.16b, v0.16b, v0.16b, #8
1269 st1 {v1.8h}, [x0], #16
1270 st1 {v0.8h}, [x0], #16
1271 .endm
1272 store_rev v16.8h, v24.8h
1273 store_rev v17.8h, v25.8h
1274 store_rev v18.8h, v26.8h
1275 store_rev v19.8h, v27.8h
1276 store_rev v20.8h, v28.8h
1277 store_rev v21.8h, v29.8h
1278 store_rev v22.8h, v30.8h
1279 store_rev v23.8h, v31.8h
1280 sub x0, x0, #512
1281 .purgem store_rev
1282
1283 // Move x2 back to the start of the input, and move
1284 // to the first odd row
1285 .ifb \suffix
1286 sub x2, x2, x9, lsl #4
1287 .endif
1288 .ifc \suffix,_quarter
1289 sub x2, x2, x9, lsl #2
1290 .endif
1291 .ifc \suffix,_half
1292 sub x2, x2, x9, lsl #3
1293 .endif
1294 add x2, x2, #64
1295
1296 movi v2.8h, #0
1297 // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1298 .ifb \suffix
1299 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1300 load_clear \i, x2, x9
1301 .endr
1302 .endif
1303 .ifc \suffix,_quarter
1304 .irp i, 16, 17, 18, 19
1305 load_clear \i, x2, x9
1306 .endr
1307 .endif
1308 .ifc \suffix,_half
1309 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1310 load_clear \i, x2, x9
1311 .endr
1312 .endif
1313
1314 bl idct32_odd\suffix
1315
1316 transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
1317 transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
1318
1319 // Store the registers a, b horizontally, adding into the output
1320 // first, and then store the mirrored registers, subtracted from
1321 // the output.
1322 .macro store_rev a, b
1323 ld1 {v4.8h}, [x0]
1324 rev64 v1.8h, \b
1325 add v4.8h, v4.8h, \a
1326 rev64 v0.8h, \a
1327 st1 {v4.8h}, [x0], #16
1328 ext v1.16b, v1.16b, v1.16b, #8
1329 ld1 {v5.8h}, [x0]
1330 ext v0.16b, v0.16b, v0.16b, #8
1331 add v5.8h, v5.8h, \b
1332 st1 {v5.8h}, [x0], #16
1333 ld1 {v6.8h}, [x0]
1334 sub v6.8h, v6.8h, v1.8h
1335 st1 {v6.8h}, [x0], #16
1336 ld1 {v7.8h}, [x0]
1337 sub v7.8h, v7.8h, v0.8h
1338 st1 {v7.8h}, [x0], #16
1339 .endm
1340
1341 store_rev v31.8h, v23.8h
1342 store_rev v30.8h, v22.8h
1343 store_rev v29.8h, v21.8h
1344 store_rev v28.8h, v20.8h
1345 store_rev v27.8h, v19.8h
1346 store_rev v26.8h, v18.8h
1347 store_rev v25.8h, v17.8h
1348 store_rev v24.8h, v16.8h
1349 .purgem store_rev
1350 br x14
1351 endfunc
1352
1353 // This is mostly the same as 8x32_pass1, but without the transpose;
1354 // it uses the source as a temp buffer between the two idct passes,
1355 // and adds into the destination.
1356 // x0 = dst
1357 // x1 = dst stride
1358 // x2 = src (temp buffer)
1359 // x7 = negative double temp buffer stride
1360 // x9 = double temp buffer stride
1361 // x10 = idct_coeffs
1362 // x11 = idct_coeffs + 32
1363 function idct32_1d_8x32_pass2\suffix\()_neon
1364 mov x14, x30
1365 ld1 {v0.8h,v1.8h}, [x10]
1366
1367 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1368 .ifb \suffix
1369 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1370 load \i, x2, x9
1371 .endr
1372 sub x2, x2, x9, lsl #4
1373 .endif
1374 .ifc \suffix,_quarter
1375 .irp i, 16, 17, 18, 19
1376 load \i, x2, x9
1377 .endr
1378 sub x2, x2, x9, lsl #2
1379 .endif
1380 .ifc \suffix,_half
1381 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1382 load \i, x2, x9
1383 .endr
1384 sub x2, x2, x9, lsl #3
1385 .endif
1386
1387 bl idct16\suffix
1388
1389 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1390 store \i, x2, x9
1391 .endr
1392
1393 sub x2, x2, x9, lsl #4
1394 add x2, x2, #64
1395
1396 // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1397 .ifb \suffix
1398 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1399 load \i, x2, x9
1400 .endr
1401 sub x2, x2, x9, lsl #4
1402 .endif
1403 .ifc \suffix,_quarter
1404 .irp i, 16, 17, 18, 19
1405 load \i, x2, x9
1406 .endr
1407 sub x2, x2, x9, lsl #2
1408 .endif
1409 .ifc \suffix,_half
1410 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1411 load \i, x2, x9
1412 .endr
1413 sub x2, x2, x9, lsl #3
1414 .endif
1415 sub x2, x2, #64
1416
1417 bl idct32_odd\suffix
1418
1419 .macro load_acc_store a, b, c, d, neg=0
1420 .if \neg == 0
1421 ld1 {v4.8h}, [x2], x9
1422 ld1 {v5.8h}, [x2], x9
1423 add v4.8h, v4.8h, \a
1424 ld1 {v6.8h}, [x2], x9
1425 add v5.8h, v5.8h, \b
1426 ld1 {v7.8h}, [x2], x9
1427 add v6.8h, v6.8h, \c
1428 add v7.8h, v7.8h, \d
1429 .else
1430 ld1 {v4.8h}, [x2], x7
1431 ld1 {v5.8h}, [x2], x7
1432 sub v4.8h, v4.8h, \a
1433 ld1 {v6.8h}, [x2], x7
1434 sub v5.8h, v5.8h, \b
1435 ld1 {v7.8h}, [x2], x7
1436 sub v6.8h, v6.8h, \c
1437 sub v7.8h, v7.8h, \d
1438 .endif
1439 ld1 {v0.8b}, [x0], x1
1440 ld1 {v1.8b}, [x0], x1
1441 srshr v4.8h, v4.8h, #6
1442 ld1 {v2.8b}, [x0], x1
1443 srshr v5.8h, v5.8h, #6
1444 uaddw v4.8h, v4.8h, v0.8b
1445 ld1 {v3.8b}, [x0], x1
1446 srshr v6.8h, v6.8h, #6
1447 uaddw v5.8h, v5.8h, v1.8b
1448 srshr v7.8h, v7.8h, #6
1449 sub x0, x0, x1, lsl #2
1450 uaddw v6.8h, v6.8h, v2.8b
1451 sqxtun v4.8b, v4.8h
1452 uaddw v7.8h, v7.8h, v3.8b
1453 sqxtun v5.8b, v5.8h
1454 st1 {v4.8b}, [x0], x1
1455 sqxtun v6.8b, v6.8h
1456 st1 {v5.8b}, [x0], x1
1457 sqxtun v7.8b, v7.8h
1458 st1 {v6.8b}, [x0], x1
1459 st1 {v7.8b}, [x0], x1
1460 .endm
1461 load_acc_store v31.8h, v30.8h, v29.8h, v28.8h
1462 load_acc_store v27.8h, v26.8h, v25.8h, v24.8h
1463 load_acc_store v23.8h, v22.8h, v21.8h, v20.8h
1464 load_acc_store v19.8h, v18.8h, v17.8h, v16.8h
1465 sub x2, x2, x9
1466 load_acc_store v16.8h, v17.8h, v18.8h, v19.8h, 1
1467 load_acc_store v20.8h, v21.8h, v22.8h, v23.8h, 1
1468 load_acc_store v24.8h, v25.8h, v26.8h, v27.8h, 1
1469 load_acc_store v28.8h, v29.8h, v30.8h, v31.8h, 1
1470 .purgem load_acc_store
1471 br x14
1472 endfunc
1473 .endm
1474
1475 idct32_funcs
1476 idct32_funcs _quarter
1477 idct32_funcs _half
1478
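// Per-slice eob thresholds for the 32x32 idct: if the eob is at or below
// the value for a given 8-column slice, that slice and all following ones
// contain only zero coefficients, so pass 1 skips them and just zero-fills
// the corresponding rows of the temp buffer.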
1479 const min_eob_idct_idct_32, align=4
1480 .short 0, 34, 135, 336
1481 endconst
1482
1483 function ff_vp9_idct_idct_32x32_add_neon, export=1
1484 cmp w3, #1
1485 b.eq idct32x32_dc_add_neon
1486
1487 movrel x10, idct_coeffs
1488 add x11, x10, #32
1489 movrel x12, min_eob_idct_idct_32, 2
1490
1491 mov x15, x30
1492
1493 stp d14, d15, [sp, #-0x10]!
1494 stp d12, d13, [sp, #-0x10]!
1495 stp d10, d11, [sp, #-0x10]!
1496 stp d8, d9, [sp, #-0x10]!
1497
1498 sub sp, sp, #2048
1499
1500 mov x4, x0
1501 mov x5, x1
1502 mov x6, x2
1503
1504 // Double stride of the input, since we only read every other line
1505 mov x9, #128
1506 neg x7, x9
1507
1508 cmp w3, #34
1509 b.le idct32x32_quarter_add_neon
1510 cmp w3, #135
1511 b.le idct32x32_half_add_neon
1512
1513 .irp i, 0, 8, 16, 24
1514 add x0, sp, #(\i*64)
1515 .if \i > 0
1516 ldrh w1, [x12], #2
1517 cmp w3, w1
1518 mov x1, #(32 - \i)/4
1519 b.le 1f
1520 .endif
1521 add x2, x6, #(\i*2)
1522 bl idct32_1d_8x32_pass1_neon
1523 .endr
1524 b 3f
1525
1526 1:
1527 // Write zeros to the temp buffer for pass 2
1528 movi v16.8h, #0
1529 movi v17.8h, #0
1530 movi v18.8h, #0
1531 movi v19.8h, #0
1532 2:
1533 subs x1, x1, #1
1534 .rept 4
1535 st1 {v16.8h-v19.8h}, [x0], #64
1536 .endr
1537 b.ne 2b
1538 3:
1539 .irp i, 0, 8, 16, 24
1540 add x0, x4, #(\i)
1541 mov x1, x5
1542 add x2, sp, #(\i*2)
1543 bl idct32_1d_8x32_pass2_neon
1544 .endr
1545
1546 add sp, sp, #2048
1547
1548 ldp d8, d9, [sp], 0x10
1549 ldp d10, d11, [sp], 0x10
1550 ldp d12, d13, [sp], 0x10
1551 ldp d14, d15, [sp], 0x10
1552
1553 br x15
1554 endfunc
1555
1556 .macro idct32_partial size
1557 function idct32x32_\size\()_add_neon
1558 add x0, sp, #(0*64)
1559 add x2, x6, #(0*2)
1560 bl idct32_1d_8x32_pass1_\size\()_neon
1561 .ifc \size,half
1562 add x0, sp, #(8*64)
1563 add x2, x6, #(8*2)
1564 bl idct32_1d_8x32_pass1_\size\()_neon
1565 .endif
1566 .irp i, 0, 8, 16, 24
1567 add x0, x4, #(\i)
1568 mov x1, x5
1569 add x2, sp, #(\i*2)
1570 bl idct32_1d_8x32_pass2_\size\()_neon
1571 .endr
1572
1573 add sp, sp, #2048
1574
1575 ldp d8, d9, [sp], 0x10
1576 ldp d10, d11, [sp], 0x10
1577 ldp d12, d13, [sp], 0x10
1578 ldp d14, d15, [sp], 0x10
1579
1580 br x15
1581 endfunc
1582 .endm
1583
1584 idct32_partial quarter
1585 idct32_partial half