aarch64: vp9itxfm: Make the larger core transforms standalone functions
libavcodec/aarch64/vp9itxfm_neon.S
/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

const itxfm4_coeffs, align=4
        .short 11585, 6270, 15137, 0
iadst4_coeffs:
        .short 5283, 15212, 9929, 13377
endconst

const iadst8_coeffs, align=4
        .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
idct_coeffs:
        .short 11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606
        .short 16305, 12665, 10394, 7723, 14449, 15679, 4756, 0
        .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
        .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
endconst

const iadst16_coeffs, align=4
        .short 16364, 804, 15893, 3981, 14811, 7005, 13160, 9760
        .short 11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207
endconst

// out1 = ((in1 + in2) * v0.h[0] + (1 << 13)) >> 14
// out2 = ((in1 - in2) * v0.h[0] + (1 << 13)) >> 14
// in/out are .8h registers; this can be done with 4 temp registers, but is
// more efficient if 6 temp registers are available.
.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
.if \neg > 0
        neg \tmp4\().4h, v0.4h
.endif
        add \tmp1\().8h, \in1\().8h, \in2\().8h
        sub \tmp2\().8h, \in1\().8h, \in2\().8h
.if \neg > 0
        smull \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
        smull2 \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0]
.else
        smull \tmp3\().4s, \tmp1\().4h, v0.h[0]
        smull2 \tmp4\().4s, \tmp1\().8h, v0.h[0]
.endif
.ifb \tmp5
        rshrn \out1\().4h, \tmp3\().4s, #14
        rshrn2 \out1\().8h, \tmp4\().4s, #14
        smull \tmp3\().4s, \tmp2\().4h, v0.h[0]
        smull2 \tmp4\().4s, \tmp2\().8h, v0.h[0]
        rshrn \out2\().4h, \tmp3\().4s, #14
        rshrn2 \out2\().8h, \tmp4\().4s, #14
.else
        smull \tmp5\().4s, \tmp2\().4h, v0.h[0]
        smull2 \tmp6\().4s, \tmp2\().8h, v0.h[0]
        rshrn \out1\().4h, \tmp3\().4s, #14
        rshrn2 \out1\().8h, \tmp4\().4s, #14
        rshrn \out2\().4h, \tmp5\().4s, #14
        rshrn2 \out2\().8h, \tmp6\().4s, #14
.endif
.endm

// out1,out2 = in1 * coef1 - in2 * coef2
// out3,out4 = in1 * coef2 + in2 * coef1
// out are 4 x .4s registers, in are 2 x .8h registers
.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
        smull \out1\().4s, \in1\().4h, \coef1
        smull2 \out2\().4s, \in1\().8h, \coef1
        smull \out3\().4s, \in1\().4h, \coef2
        smull2 \out4\().4s, \in1\().8h, \coef2
        smlsl \out1\().4s, \in2\().4h, \coef2
        smlsl2 \out2\().4s, \in2\().8h, \coef2
        smlal \out3\().4s, \in2\().4h, \coef1
        smlal2 \out4\().4s, \in2\().8h, \coef1
.endm

// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
// inout are 2 x .8h registers
.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
        dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
.if \neg > 0
        neg \tmp3\().4s, \tmp3\().4s
        neg \tmp4\().4s, \tmp4\().4s
.endif
        rshrn \inout1\().4h, \tmp1\().4s, #14
        rshrn2 \inout1\().8h, \tmp2\().4s, #14
        rshrn \inout2\().4h, \tmp3\().4s, #14
        rshrn2 \inout2\().8h, \tmp4\().4s, #14
.endm

// out1 = in1 + in2
// out2 = in1 - in2
.macro butterfly_8h out1, out2, in1, in2
        add \out1\().8h, \in1\().8h, \in2\().8h
        sub \out2\().8h, \in1\().8h, \in2\().8h
.endm

// out1 = in1 - in2
// out2 = in1 + in2
.macro butterfly_8h_r out1, out2, in1, in2
        sub \out1\().8h, \in1\().8h, \in2\().8h
        add \out2\().8h, \in1\().8h, \in2\().8h
.endm

// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
// out are 2 x .8h registers, in are 4 x .4s registers
.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
        add \tmp1\().4s, \in1\().4s, \in3\().4s
        add \tmp2\().4s, \in2\().4s, \in4\().4s
        sub \tmp3\().4s, \in1\().4s, \in3\().4s
        sub \tmp4\().4s, \in2\().4s, \in4\().4s
        rshrn \out1\().4h, \tmp1\().4s, #14
        rshrn2 \out1\().8h, \tmp2\().4s, #14
        rshrn \out2\().4h, \tmp3\().4s, #14
        rshrn2 \out2\().8h, \tmp4\().4s, #14
.endm

.macro iwht4 c0, c1, c2, c3
        add \c0\().4h, \c0\().4h, \c1\().4h
        sub v17.4h, \c2\().4h, \c3\().4h
        sub v16.4h, \c0\().4h, v17.4h
        sshr v16.4h, v16.4h, #1
        sub \c2\().4h, v16.4h, \c1\().4h
        sub \c1\().4h, v16.4h, \c3\().4h
        add \c3\().4h, v17.4h, \c2\().4h
        sub \c0\().4h, \c0\().4h, \c1\().4h
.endm

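// In-place 4-point IDCT of the .4h registers c0-c3, using the idct
// coefficients preloaded in v0.h[0-2]. Clobbers v16-v20 and v22.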
.macro idct4 c0, c1, c2, c3
        smull v22.4s, \c1\().4h, v0.h[2]
        smull v20.4s, \c1\().4h, v0.h[1]
        add v16.4h, \c0\().4h, \c2\().4h
        sub v17.4h, \c0\().4h, \c2\().4h
        smlal v22.4s, \c3\().4h, v0.h[1]
        smull v18.4s, v16.4h, v0.h[0]
        smull v19.4s, v17.4h, v0.h[0]
        smlsl v20.4s, \c3\().4h, v0.h[2]
        rshrn v22.4h, v22.4s, #14
        rshrn v18.4h, v18.4s, #14
        rshrn v19.4h, v19.4s, #14
        rshrn v20.4h, v20.4s, #14
        add \c0\().4h, v18.4h, v22.4h
        sub \c3\().4h, v18.4h, v22.4h
        add \c1\().4h, v19.4h, v20.4h
        sub \c2\().4h, v19.4h, v20.4h
.endm

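// In-place 4-point IADST of the .4h registers c0-c3, using the iadst
// coefficients preloaded in v0.h[4-7]. Clobbers v16-v21.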
.macro iadst4 c0, c1, c2, c3
        smull v16.4s, \c0\().4h, v0.h[4]
        smlal v16.4s, \c2\().4h, v0.h[5]
        smlal v16.4s, \c3\().4h, v0.h[6]
        smull v17.4s, \c0\().4h, v0.h[6]
        smlsl v17.4s, \c2\().4h, v0.h[4]
        sub \c0\().4h, \c0\().4h, \c2\().4h
        smlsl v17.4s, \c3\().4h, v0.h[5]
        add \c0\().4h, \c0\().4h, \c3\().4h
        smull v19.4s, \c1\().4h, v0.h[7]
        smull v18.4s, \c0\().4h, v0.h[7]
        add v20.4s, v16.4s, v19.4s
        add v21.4s, v17.4s, v19.4s
        rshrn \c0\().4h, v20.4s, #14
        add v16.4s, v16.4s, v17.4s
        rshrn \c1\().4h, v21.4s, #14
        sub v16.4s, v16.4s, v19.4s
        rshrn \c2\().4h, v18.4s, #14
        rshrn \c3\().4h, v16.4s, #14
.endm

// The public functions in this file have the following signature:
// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);

.macro itxfm_func4x4 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
.ifc \txfm1,\txfm2
.ifc \txfm1,idct
        movrel x4, itxfm4_coeffs
        ld1 {v0.4h}, [x4]
.endif
.ifc \txfm1,iadst
        movrel x4, iadst4_coeffs
        ld1 {v0.d}[1], [x4]
.endif
.else
        movrel x4, itxfm4_coeffs
        ld1 {v0.8h}, [x4]
.endif

        movi v31.8h, #0
.ifc \txfm1\()_\txfm2,idct_idct
        cmp w3, #1
        b.ne 1f
        // DC-only for idct/idct
        ld1r {v2.4h}, [x2]
        smull v2.4s, v2.4h, v0.h[0]
        rshrn v2.4h, v2.4s, #14
        smull v2.4s, v2.4h, v0.h[0]
        rshrn v2.4h, v2.4s, #14
        st1 {v31.h}[0], [x2]
        dup v4.4h, v2.h[0]
        mov v5.16b, v4.16b
        mov v6.16b, v4.16b
        mov v7.16b, v4.16b
        b 2f
.endif

1:
        ld1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x2]
        st1 {v31.8h}, [x2], #16

.ifc \txfm1,iwht
        sshr v4.4h, v4.4h, #2
        sshr v5.4h, v5.4h, #2
        sshr v6.4h, v6.4h, #2
        sshr v7.4h, v7.4h, #2
.endif

        \txfm1\()4 v4, v5, v6, v7

        st1 {v31.8h}, [x2], #16
        // Transpose 4x4 with 16 bit elements
        transpose_4x4H v4, v5, v6, v7, v16, v17, v18, v19

        \txfm2\()4 v4, v5, v6, v7
2:
        ld1r {v0.2s}, [x0], x1
        ld1r {v1.2s}, [x0], x1
.ifnc \txfm1,iwht
        srshr v4.4h, v4.4h, #4
        srshr v5.4h, v5.4h, #4
        srshr v6.4h, v6.4h, #4
        srshr v7.4h, v7.4h, #4
.endif
        uaddw v4.8h, v4.8h, v0.8b
        uaddw v5.8h, v5.8h, v1.8b
        ld1r {v2.2s}, [x0], x1
        ld1r {v3.2s}, [x0], x1
        sqxtun v0.8b, v4.8h
        sqxtun v1.8b, v5.8h
        sub x0, x0, x1, lsl #2

        uaddw v6.8h, v6.8h, v2.8b
        uaddw v7.8h, v7.8h, v3.8b
        st1 {v0.s}[0], [x0], x1
        sqxtun v2.8b, v6.8h
        sqxtun v3.8b, v7.8h

        st1 {v1.s}[0], [x0], x1
        st1 {v2.s}[0], [x0], x1
        st1 {v3.s}[0], [x0], x1

        ret
endfunc
.endm

itxfm_func4x4 idct, idct
itxfm_func4x4 iadst, idct
itxfm_func4x4 idct, iadst
itxfm_func4x4 iadst, iadst
itxfm_func4x4 iwht, iwht


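// In-place 8-point IDCT of the .8h registers v16-v23, using the idct
// coefficients preloaded in v0. Clobbers v2-v7 and v24-v31.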
.macro idct8
        dmbutterfly0 v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
        dmbutterfly v18, v22, v0.h[1], v0.h[2], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
        dmbutterfly v17, v23, v0.h[3], v0.h[4], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
        dmbutterfly v21, v19, v0.h[5], v0.h[6], v2, v3, v4, v5 // v21 = t5a, v19 = t6a

        butterfly_8h v24, v25, v16, v22 // v24 = t0, v25 = t3
        butterfly_8h v28, v29, v17, v21 // v28 = t4, v29 = t5a
        butterfly_8h v30, v31, v23, v19 // v30 = t7, v31 = t6a
        butterfly_8h v26, v27, v20, v18 // v26 = t1, v27 = t2

        dmbutterfly0 v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5

        butterfly_8h v16, v23, v24, v30 // v16 = out[0], v23 = out[7]
        butterfly_8h v17, v22, v26, v31 // v17 = out[1], v22 = out[6]
        butterfly_8h v18, v21, v27, v29 // v18 = out[2], v21 = out[5]
        butterfly_8h v19, v20, v25, v28 // v19 = out[3], v20 = out[4]
.endm

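// In-place 8-point IADST of the .8h registers v16-v23, using the iadst8
// coefficients preloaded in v1 and the idct coefficients in v0.
// Clobbers v2-v7 and v24-v31.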
.macro iadst8
        dmbutterfly_l v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0] // v24,v25 = t1a, v26,v27 = t0a
        dmbutterfly_l v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2] // v28,v29 = t3a, v30,v31 = t2a
        dmbutterfly_l v2, v3, v4, v5, v19, v20, v1.h[5], v1.h[4] // v2,v3 = t5a, v4,v5 = t4a
        dmbutterfly_l v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6] // v16,v18 = t7a, v21,v23 = t6a

        dbutterfly_n v4, v5, v26, v27, v4, v5, v6, v7, v26, v27 // v4 = t0, v5 = t4
        dbutterfly_n v2, v3, v24, v25, v2, v3, v6, v7, v26, v27 // v2 = t1, v3 = t5
        dbutterfly_n v24, v25, v30, v31, v21, v23, v6, v7, v26, v27 // v24 = t2, v25 = t6
        dbutterfly_n v30, v31, v28, v29, v16, v18, v6, v7, v26, v27 // v30 = t3, v31 = t7

        butterfly_8h v16, v6, v4, v24 // v16 = out[0], v6 = t2
        butterfly_8h v23, v7, v2, v30 // v23 = -out[7], v7 = t3
        neg v23.8h, v23.8h // v23 = out[7]

        dmbutterfly0 v19, v20, v6, v7, v24, v26, v27, v28, v29, v30 // v19 = -out[3], v20 = out[4]
        neg v19.8h, v19.8h // v19 = out[3]

        dmbutterfly_l v26, v27, v28, v29, v5, v3, v0.h[1], v0.h[2] // v26,v27 = t5a, v28,v29 = t4a
        dmbutterfly_l v2, v3, v4, v5, v31, v25, v0.h[2], v0.h[1] // v2,v3 = t6a, v4,v5 = t7a

        dbutterfly_n v17, v30, v28, v29, v2, v3, v6, v7, v24, v25 // v17 = -out[1], v30 = t6
        dbutterfly_n v22, v31, v26, v27, v4, v5, v6, v7, v24, v25 // v22 = out[6], v31 = t7
        neg v17.8h, v17.8h // v17 = out[1]

        dmbutterfly0 v18, v21, v30, v31, v2, v3, v4, v5, v6, v7 // v18 = out[2], v21 = -out[5]
        neg v21.8h, v21.8h // v21 = out[5]
.endm


.macro itxfm_func8x8 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
        // The iadst also uses a few coefficients from
        // idct, so those always need to be loaded.
.ifc \txfm1\()_\txfm2,idct_idct
        movrel x4, idct_coeffs
        ld1 {v0.8h}, [x4]
.else
        movrel x4, iadst8_coeffs
        ld1 {v1.8h}, [x4], #16
        ld1 {v0.8h}, [x4]
.endif

        movi v2.16b, #0
        movi v3.16b, #0
        movi v4.16b, #0
        movi v5.16b, #0

.ifc \txfm1\()_\txfm2,idct_idct
        cmp w3, #1
        b.ne 1f
        // DC-only for idct/idct
        ld1r {v2.4h}, [x2]
        smull v2.4s, v2.4h, v0.h[0]
        rshrn v2.4h, v2.4s, #14
        smull v2.4s, v2.4h, v0.h[0]
        rshrn v2.4h, v2.4s, #14
        st1 {v3.h}[0], [x2]
        dup v16.8h, v2.h[0]
        mov v17.16b, v16.16b
        mov v18.16b, v16.16b
        mov v19.16b, v16.16b
        mov v20.16b, v16.16b
        mov v21.16b, v16.16b
        mov v22.16b, v16.16b
        mov v23.16b, v16.16b
        b 2f
.endif
1:
        ld1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x2], #64
        ld1 {v20.16b,v21.16b,v22.16b,v23.16b}, [x2], #64
        sub x2, x2, #128
        st1 {v2.16b,v3.16b,v4.16b,v5.16b}, [x2], #64
        st1 {v2.16b,v3.16b,v4.16b,v5.16b}, [x2], #64

        \txfm1\()8

        // Transpose 8x8 with 16 bit elements
        transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v24, v25

        \txfm2\()8
2:
        mov x3, x0
        // Add into the destination
        ld1 {v0.8b}, [x0], x1
        srshr v16.8h, v16.8h, #5
        ld1 {v1.8b}, [x0], x1
        srshr v17.8h, v17.8h, #5
        ld1 {v2.8b}, [x0], x1
        srshr v18.8h, v18.8h, #5
        uaddw v16.8h, v16.8h, v0.8b
        ld1 {v3.8b}, [x0], x1
        srshr v19.8h, v19.8h, #5
        uaddw v17.8h, v17.8h, v1.8b
        ld1 {v4.8b}, [x0], x1
        srshr v20.8h, v20.8h, #5
        uaddw v18.8h, v18.8h, v2.8b
        sqxtun v0.8b, v16.8h
        ld1 {v5.8b}, [x0], x1
        srshr v21.8h, v21.8h, #5
        uaddw v19.8h, v19.8h, v3.8b
        sqxtun v1.8b, v17.8h
        ld1 {v6.8b}, [x0], x1
        srshr v22.8h, v22.8h, #5
        uaddw v20.8h, v20.8h, v4.8b
        sqxtun v2.8b, v18.8h
        ld1 {v7.8b}, [x0], x1
        srshr v23.8h, v23.8h, #5
        uaddw v21.8h, v21.8h, v5.8b
        sqxtun v3.8b, v19.8h

        st1 {v0.8b}, [x3], x1
        uaddw v22.8h, v22.8h, v6.8b
        st1 {v1.8b}, [x3], x1
        sqxtun v4.8b, v20.8h
        st1 {v2.8b}, [x3], x1
        uaddw v23.8h, v23.8h, v7.8b
        st1 {v3.8b}, [x3], x1
        sqxtun v5.8b, v21.8h
        st1 {v4.8b}, [x3], x1
        sqxtun v6.8b, v22.8h
        st1 {v5.8b}, [x3], x1
        sqxtun v7.8b, v23.8h

        st1 {v6.8b}, [x3], x1
        st1 {v7.8b}, [x3], x1

        ret
endfunc
.endm

itxfm_func8x8 idct, idct
itxfm_func8x8 iadst, idct
itxfm_func8x8 idct, iadst
itxfm_func8x8 iadst, iadst


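// DC-only special case for the 16x16 idct/idct: scale the single DC
// coefficient through both 1-D transform passes and add the resulting
// constant to all 16x16 output pixels.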
function idct16x16_dc_add_neon
        movrel x4, idct_coeffs
        ld1 {v0.4h}, [x4]

        movi v1.4h, #0

        ld1r {v2.4h}, [x2]
        smull v2.4s, v2.4h, v0.h[0]
        rshrn v2.4h, v2.4s, #14
        smull v2.4s, v2.4h, v0.h[0]
        rshrn v2.4h, v2.4s, #14
        dup v2.8h, v2.h[0]
        st1 {v1.h}[0], [x2]

        srshr v2.8h, v2.8h, #6

        mov x4, #16
1:
        // Loop to add the constant from v2 into all 16x16 outputs
        ld1 {v3.16b}, [x0]
        uaddw v4.8h, v2.8h, v3.8b
        uaddw2 v5.8h, v2.8h, v3.16b
        sqxtun v4.8b, v4.8h
        sqxtun2 v4.16b, v5.8h
        st1 {v4.16b}, [x0], x1
        subs x4, x4, #1
        b.ne 1b

        ret
endfunc

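// 16-point IDCT working in-place on the .8h registers v16-v31, with the
// idct coefficients preloaded in v0-v1. Clobbers v2-v7.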
function idct16
        dmbutterfly0 v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
        dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
        dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
        dmbutterfly v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
        dmbutterfly v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
        dmbutterfly v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
        dmbutterfly v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
        dmbutterfly v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a

        butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3
        butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2
        butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5
        butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6
        butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9
        butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10
        butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13
        butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14

        dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
        dmbutterfly v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
        dmbutterfly v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a

        butterfly_8h v18, v7, v4, v7 // v18 = t0a, v7 = t7a
        butterfly_8h v19, v22, v5, v22 // v19 = t1a, v22 = t6
        butterfly_8h v4, v26, v20, v26 // v4 = t2a, v26 = t5
        butterfly_8h v5, v6, v28, v6 // v5 = t3a, v6 = t4
        butterfly_8h v20, v28, v16, v24 // v20 = t8a, v28 = t11a
        butterfly_8h v24, v21, v23, v21 // v24 = t9, v21 = t10
        butterfly_8h v23, v27, v25, v27 // v23 = t14, v27 = t13
        butterfly_8h v25, v29, v29, v17 // v25 = t15a, v29 = t12a

        dmbutterfly0 v2, v3, v27, v21, v2, v3, v16, v17, v30, v31 // v2 = t13a, v3 = t10a
        dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11

        butterfly_8h v16, v31, v18, v25 // v16 = out[0], v31 = out[15]
        butterfly_8h v17, v30, v19, v23 // v17 = out[1], v30 = out[14]
        butterfly_8h_r v25, v22, v22, v24 // v25 = out[9], v22 = out[6]
        butterfly_8h v23, v24, v7, v20 // v23 = out[7], v24 = out[8]
        butterfly_8h v18, v29, v4, v2 // v18 = out[2], v29 = out[13]
        butterfly_8h v19, v28, v5, v28 // v19 = out[3], v28 = out[12]
        butterfly_8h v20, v27, v6, v27 // v20 = out[4], v27 = out[11]
        butterfly_8h v21, v26, v26, v3 // v21 = out[5], v26 = out[10]
        ret
endfunc

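// 16-point IADST working in-place on the .8h registers v16-v31. Expects
// x10 to point at idct_coeffs and x11 at iadst16_coeffs. Clobbers v0-v15.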
function iadst16
        ld1 {v0.8h,v1.8h}, [x11]

        dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.h[1], v0.h[0] // v6,v7 = t1, v4,v5 = t0
        dmbutterfly_l v10, v11, v8, v9, v23, v24, v1.h[1], v1.h[0] // v10,v11 = t9, v8,v9 = t8
        dbutterfly_n v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a
        dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2] // v14,v15 = t3, v12,v13 = t2
        dbutterfly_n v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a

        dmbutterfly_l v6, v7, v4, v5, v21, v26, v1.h[3], v1.h[2] // v6,v7 = t11, v4,v5 = t10
        dbutterfly_n v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a
        dmbutterfly_l v10, v11, v8, v9, v27, v20, v0.h[5], v0.h[4] // v10,v11 = t5, v8,v9 = t4
        dbutterfly_n v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a

        dmbutterfly_l v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4] // v14,v15 = t13, v12,v13 = t12
        dbutterfly_n v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a
        dmbutterfly_l v6, v7, v4, v5, v25, v22, v0.h[7], v0.h[6] // v6,v7 = t7, v4,v5 = t6
        dbutterfly_n v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a

        dmbutterfly_l v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6] // v10,v11 = t15, v8,v9 = t14
        ld1 {v0.8h}, [x10]
        dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a
        dmbutterfly_l v14, v15, v12, v13, v23, v24, v0.h[3], v0.h[4] // v14,v15 = t9, v12,v13 = t8
        dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a

        dmbutterfly_l v4, v5, v6, v7, v28, v19, v0.h[4], v0.h[3] // v4,v5 = t12, v6,v7 = t13
        dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a
        dmbutterfly_l v10, v11, v8, v9, v21, v26, v0.h[5], v0.h[6] // v10,v11 = t11, v8,v9 = t10
        butterfly_8h_r v4, v27, v16, v27 // v4 = t4, v27 = t0
        dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a

        dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[6], v0.h[5] // v12,v13 = t14, v14,v15 = t15
        butterfly_8h_r v5, v20, v31, v20 // v5 = t5, v20 = t1
        dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a
        dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a

        butterfly_8h_r v6, v25, v18, v25 // v6 = t6, v25 = t2
        butterfly_8h_r v7, v22, v29, v22 // v7 = t7, v22 = t3

        dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.h[1], v0.h[2] // v10,v11 = t13, v8,v9 = t12
        dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[2], v0.h[1] // v12,v13 = t14, v14,v15 = t15

        dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a
        dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a
        neg v29.8h, v29.8h // v29 = out[13]

        dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.h[1], v0.h[2] // v10,v11 = t5a, v8,v9 = t4a
        dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.h[2], v0.h[1] // v12,v13 = t6a, v14,v15 = t7a

        butterfly_8h v2, v6, v27, v25 // v2 = out[0], v6 = t2a
        butterfly_8h v3, v7, v23, v21 // v3 = -out[1], v7 = t10

        dbutterfly_n v19, v31, v8, v9, v12, v13, v4, v5, v8, v9 // v19 = -out[3], v31 = t6
        neg v19.8h, v19.8h // v19 = out[3]
        dbutterfly_n v28, v16, v10, v11, v14, v15, v4, v5, v10, v11 // v28 = out[12], v16 = t7

        butterfly_8h v5, v8, v20, v22 // v5 = -out[15], v8 = t3a
        butterfly_8h v4, v9, v24, v26 // v4 = out[14], v9 = t11

        dmbutterfly0 v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
        dmbutterfly0 v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
        dmbutterfly0 v20, v27, v16, v31, v10, v11, v12, v13, v14, v15 // v20 = out[4], v27 = out[11]
        dmbutterfly0 v22, v25, v9, v7, v10, v11, v12, v13, v14, v15 // v22 = out[6], v25 = out[9]

        neg v31.8h, v5.8h // v31 = out[15]
        neg v17.8h, v3.8h // v17 = out[1]

        mov v16.16b, v2.16b
        mov v30.16b, v4.16b
        ret
endfunc

// Helper macros; we can't use these expressions directly within
// e.g. .irp due to the extra concatenation \(). Therefore wrap
// them in macros to allow using .irp below.
.macro load i, src, inc
        ld1 {v\i\().8h}, [\src], \inc
.endm
.macro store i, dst, inc
        st1 {v\i\().8h}, [\dst], \inc
.endm
.macro movi_v i, size, imm
        movi v\i\()\size, \imm
.endm
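// Load one input row into a register and clear the corresponding memory
// (v2 is assumed to be preloaded with zero).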
.macro load_clear i, src, inc
        ld1 {v\i\().8h}, [\src]
        st1 {v2.8h}, [\src], \inc
.endm

// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
// transpose into a horizontal 16x8 slice and store.
// x0 = dst (temp buffer)
// x1 = slice offset
// x2 = src
// x9 = input stride
.macro itxfm16_1d_funcs txfm
function \txfm\()16_1d_8x16_pass1_neon
        mov x14, x30

        movi v2.8h, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear \i, x2, x9
.endr

        bl \txfm\()16

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
        // transposed 8x8 blocks.
        transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the transposed 8x8 blocks horizontally.
        cmp x1, #8
        b.eq 1f
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
        store \i, x0, #16
.endr
        br x14
1:
        // Special case: For the last input column (x1 == 8),
        // which would be stored as the last row in the temp buffer,
        // don't store the first 8x8 block, but keep it in registers
        // for the first slice of the second pass (where it is the
        // last 8x8 block).
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        add x0, x0, #16
        store \i, x0, #16
.endr
        mov v24.16b, v16.16b
        mov v25.16b, v17.16b
        mov v26.16b, v18.16b
        mov v27.16b, v19.16b
        mov v28.16b, v20.16b
        mov v29.16b, v21.16b
        mov v30.16b, v22.16b
        mov v31.16b, v23.16b
        br x14
endfunc

// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
// load the destination pixels (from a similar 8x16 slice), add and store back.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x3 = slice offset
// x9 = temp buffer stride
function \txfm\()16_1d_8x16_pass2_neon
        mov x14, x30
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load \i, x2, x9
.endr
        cbz x3, 1f
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        load \i, x2, x9
.endr
1:

        add x3, x0, x1
        lsl x1, x1, #1
        bl \txfm\()16

.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
        srshr \coef0, \coef0, #6
        ld1 {v2.8b}, [x0], x1
        srshr \coef1, \coef1, #6
        ld1 {v3.8b}, [x3], x1
        srshr \coef2, \coef2, #6
        ld1 {v4.8b}, [x0], x1
        srshr \coef3, \coef3, #6
        uaddw \coef0, \coef0, v2.8b
        ld1 {v5.8b}, [x3], x1
        uaddw \coef1, \coef1, v3.8b
        srshr \coef4, \coef4, #6
        ld1 {v6.8b}, [x0], x1
        srshr \coef5, \coef5, #6
        ld1 {v7.8b}, [x3], x1
        sqxtun v2.8b, \coef0
        srshr \coef6, \coef6, #6
        sqxtun v3.8b, \coef1
        srshr \coef7, \coef7, #6
        uaddw \coef2, \coef2, v4.8b
        ld1 {\tmp1}, [x0], x1
        uaddw \coef3, \coef3, v5.8b
        ld1 {\tmp2}, [x3], x1
        sqxtun v4.8b, \coef2
        sub x0, x0, x1, lsl #2
        sub x3, x3, x1, lsl #2
        sqxtun v5.8b, \coef3
        uaddw \coef4, \coef4, v6.8b
        st1 {v2.8b}, [x0], x1
        uaddw \coef5, \coef5, v7.8b
        st1 {v3.8b}, [x3], x1
        sqxtun v6.8b, \coef4
        st1 {v4.8b}, [x0], x1
        sqxtun v7.8b, \coef5
        st1 {v5.8b}, [x3], x1
        uaddw \coef6, \coef6, \tmp1
        st1 {v6.8b}, [x0], x1
        uaddw \coef7, \coef7, \tmp2
        st1 {v7.8b}, [x3], x1
        sqxtun \tmp1, \coef6
        sqxtun \tmp2, \coef7
        st1 {\tmp1}, [x0], x1
        st1 {\tmp2}, [x3], x1
.endm
        load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
        load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
.purgem load_add_store

        br x14
endfunc
.endm

itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst

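// Full 16x16 transform: two passes over the coefficients, processing two
// 8x16 slices per pass, with a 512 byte temp buffer on the stack between
// the passes.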
.macro itxfm_func16x16 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.ifc \txfm1\()_\txfm2,idct_idct
        cmp w3, #1
        b.eq idct16x16_dc_add_neon
.endif
        mov x15, x30
        // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
.ifnc \txfm1\()_\txfm2,idct_idct
        stp d14, d15, [sp, #-0x10]!
        stp d12, d13, [sp, #-0x10]!
        stp d10, d11, [sp, #-0x10]!
        stp d8, d9, [sp, #-0x10]!
.endif

        sub sp, sp, #512

        mov x4, x0
        mov x5, x1
        mov x6, x2

        movrel x10, idct_coeffs
.ifnc \txfm1\()_\txfm2,idct_idct
        movrel x11, iadst16_coeffs
.endif
.ifc \txfm1,idct
        ld1 {v0.8h,v1.8h}, [x10]
.endif
        mov x9, #32

.irp i, 0, 8
        add x0, sp, #(\i*32)
.ifc \txfm1\()_\txfm2,idct_idct
.if \i == 8
        cmp w3, #38
        b.le 1f
.endif
.endif
        mov x1, #\i
        add x2, x6, #(\i*2)
        bl \txfm1\()16_1d_8x16_pass1_neon
.endr
.ifc \txfm1\()_\txfm2,iadst_idct
        ld1 {v0.8h,v1.8h}, [x10]
.endif

.ifc \txfm1\()_\txfm2,idct_idct
        b 3f
1:
        // Set v24-v31 to zero, for the in-register passthrough of
        // coefficients to pass 2. Since we only do two slices, this can
        // only ever happen for the second slice. So we only need to store
        // zeros to the temp buffer for the second half of the buffer.
        // Move x0 to the second half, and use x9 == 32 as increment.
        add x0, x0, #16
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        movi_v \i, .16b, #0
        st1 {v24.8h}, [x0], x9
.endr
3:
.endif

.irp i, 0, 8
        add x0, x4, #(\i)
        mov x1, x5
        add x2, sp, #(\i*2)
        mov x3, #\i
        bl \txfm2\()16_1d_8x16_pass2_neon
.endr

        add sp, sp, #512
.ifnc \txfm1\()_\txfm2,idct_idct
        ldp d8, d9, [sp], 0x10
        ldp d10, d11, [sp], 0x10
        ldp d12, d13, [sp], 0x10
        ldp d14, d15, [sp], 0x10
.endif
        br x15
endfunc
.endm

itxfm_func16x16 idct, idct
itxfm_func16x16 iadst, idct
itxfm_func16x16 idct, iadst
itxfm_func16x16 iadst, iadst


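// DC-only special case for the 32x32 idct/idct, analogous to the 16x16 one:
// compute the constant output value and add it to all 32x32 pixels.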
function idct32x32_dc_add_neon
        movrel x4, idct_coeffs
        ld1 {v0.4h}, [x4]

        movi v1.4h, #0

        ld1r {v2.4h}, [x2]
        smull v2.4s, v2.4h, v0.h[0]
        rshrn v2.4h, v2.4s, #14
        smull v2.4s, v2.4h, v0.h[0]
        rshrn v2.4h, v2.4s, #14
        dup v2.8h, v2.h[0]
        st1 {v1.h}[0], [x2]

        srshr v0.8h, v2.8h, #6

        mov x4, #32
1:
        // Loop to add the constant v0 into all 32x32 outputs
        ld1 {v1.16b,v2.16b}, [x0]
        uaddw v3.8h, v0.8h, v1.8b
        uaddw2 v4.8h, v0.8h, v1.16b
        uaddw v5.8h, v0.8h, v2.8b
        uaddw2 v6.8h, v0.8h, v2.16b
        sqxtun v3.8b, v3.8h
        sqxtun2 v3.16b, v4.8h
        sqxtun v4.8b, v5.8h
        sqxtun2 v4.16b, v6.8h
        st1 {v3.16b,v4.16b}, [x0], x1
        subs x4, x4, #1
        b.ne 1b

        ret
endfunc

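// Second half of the 32-point IDCT: a 16-point transform of the odd-index
// inputs, working in-place on the .8h registers v16-v31. Expects x10 to
// point at idct_coeffs and x11 at idct_coeffs + 32. Clobbers v0-v7.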
function idct32_odd
        ld1 {v0.8h,v1.8h}, [x11]

        dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        ld1 {v0.8h}, [x10]

        butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17
        butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18
        butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21
        butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22
        butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
        dmbutterfly v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
        dmbutterfly v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a

        butterfly_8h v16, v5, v4, v5 // v16 = t16a, v5 = t19a
        butterfly_8h v17, v20, v23, v20 // v17 = t17, v20 = t18
        butterfly_8h v18, v6, v7, v6 // v18 = t23a, v6 = t20a
        butterfly_8h v19, v21, v22, v21 // v19 = t22, v21 = t21
        butterfly_8h v4, v28, v28, v30 // v4 = t24a, v28 = t27a
        butterfly_8h v23, v26, v25, v26 // v23 = t25, v26 = t26
        butterfly_8h v7, v3, v29, v31 // v7 = t31a, v3 = t28a
        butterfly_8h v22, v27, v24, v27 // v22 = t30, v27 = t29

        dmbutterfly v27, v20, v0.h[1], v0.h[2], v24, v25, v30, v31 // v27 = t18a, v20 = t29a
        dmbutterfly v3, v5, v0.h[1], v0.h[2], v24, v25, v30, v31 // v3 = t19, v5 = t28
        dmbutterfly v28, v6, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20
        dmbutterfly v26, v21, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a

        butterfly_8h v31, v24, v7, v4 // v31 = t31, v24 = t24
        butterfly_8h v30, v25, v22, v23 // v30 = t30a, v25 = t25a
        butterfly_8h_r v23, v16, v16, v18 // v23 = t23, v16 = t16
        butterfly_8h_r v22, v17, v17, v19 // v22 = t22a, v17 = t17a
        butterfly_8h v18, v21, v27, v21 // v18 = t18, v21 = t21
        butterfly_8h_r v27, v28, v5, v28 // v27 = t27a, v28 = t28a
        butterfly_8h v29, v26, v20, v26 // v29 = t29, v26 = t26
        butterfly_8h v19, v20, v3, v6 // v19 = t19a, v20 = t20

        dmbutterfly0 v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27, v20 = t20
        dmbutterfly0 v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
        dmbutterfly0 v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25, v22 = t22
        dmbutterfly0 v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
        ret
endfunc

// Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix.
// The 32-point IDCT can be decomposed into two 16-point IDCTs;
// a normal IDCT16 with every other input component (the even ones, with
// each output written twice), followed by a separate 16-point IDCT
// of the odd inputs, added/subtracted onto the outputs of the first idct16.
// x0 = dst (temp buffer)
// x1 = unused
// x2 = src
// x9 = double input stride
// x10 = idct_coeffs
// x11 = idct_coeffs + 32
function idct32_1d_8x32_pass1_neon
        mov x14, x30
        ld1 {v0.8h,v1.8h}, [x10]

        movi v4.8h, #0

        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1 {v\i\().8h}, [x2]
        st1 {v4.8h}, [x2], x9
.endr

        bl idct16

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
        // two transposed 8x8 blocks.
        transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the registers a, b horizontally, followed by the
        // same registers b, a mirrored.
.macro store_rev a, b
        // There's no rev128 instruction, but we reverse each 64 bit
        // half, and then flip them using an ext with 8 bytes offset.
        rev64 v1.8h, \b
        st1 {\a}, [x0], #16
        rev64 v0.8h, \a
        ext v1.16b, v1.16b, v1.16b, #8
        st1 {\b}, [x0], #16
        ext v0.16b, v0.16b, v0.16b, #8
        st1 {v1.8h}, [x0], #16
        st1 {v0.8h}, [x0], #16
.endm
        store_rev v16.8h, v24.8h
        store_rev v17.8h, v25.8h
        store_rev v18.8h, v26.8h
        store_rev v19.8h, v27.8h
        store_rev v20.8h, v28.8h
        store_rev v21.8h, v29.8h
        store_rev v22.8h, v30.8h
        store_rev v23.8h, v31.8h
        sub x0, x0, #512
.purgem store_rev

        // Move x2 back to the start of the input, and move
        // to the first odd row
        sub x2, x2, x9, lsl #4
        add x2, x2, #64

        movi v4.8h, #0
        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1 {v\i\().8h}, [x2]
        st1 {v4.8h}, [x2], x9
.endr

        bl idct32_odd

        transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
        transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3

        // Store the registers a, b horizontally, added into the output
        // first, followed by the same registers b, a mirrored, subtracted
        // from the output.
.macro store_rev a, b
        ld1 {v4.8h}, [x0]
        rev64 v1.8h, \b
        add v4.8h, v4.8h, \a
        rev64 v0.8h, \a
        st1 {v4.8h}, [x0], #16
        ext v1.16b, v1.16b, v1.16b, #8
        ld1 {v5.8h}, [x0]
        ext v0.16b, v0.16b, v0.16b, #8
        add v5.8h, v5.8h, \b
        st1 {v5.8h}, [x0], #16
        ld1 {v6.8h}, [x0]
        sub v6.8h, v6.8h, v1.8h
        st1 {v6.8h}, [x0], #16
        ld1 {v7.8h}, [x0]
        sub v7.8h, v7.8h, v0.8h
        st1 {v7.8h}, [x0], #16
.endm

        store_rev v31.8h, v23.8h
        store_rev v30.8h, v22.8h
        store_rev v29.8h, v21.8h
        store_rev v28.8h, v20.8h
        store_rev v27.8h, v19.8h
        store_rev v26.8h, v18.8h
        store_rev v25.8h, v17.8h
        store_rev v24.8h, v16.8h
.purgem store_rev
        br x14
endfunc

// This is mostly the same as 8x32_pass1, but without the transpose; it
// uses the source as a temp buffer between the two idct passes, and
// adds into the destination.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x7 = negative double temp buffer stride
// x9 = double temp buffer stride
// x10 = idct_coeffs
// x11 = idct_coeffs + 32
function idct32_1d_8x32_pass2_neon
        mov x14, x30
        ld1 {v0.8h,v1.8h}, [x10]

        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1 {v\i\().8h}, [x2], x9
.endr
        sub x2, x2, x9, lsl #4

        bl idct16

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        st1 {v\i\().8h}, [x2], x9
.endr

        sub x2, x2, x9, lsl #4
        add x2, x2, #64

        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1 {v\i\().8h}, [x2], x9
.endr
        sub x2, x2, x9, lsl #4
        sub x2, x2, #64

        bl idct32_odd

.macro load_acc_store a, b, c, d, neg=0
.if \neg == 0
        ld1 {v4.8h}, [x2], x9
        ld1 {v5.8h}, [x2], x9
        add v4.8h, v4.8h, \a
        ld1 {v6.8h}, [x2], x9
        add v5.8h, v5.8h, \b
        ld1 {v7.8h}, [x2], x9
        add v6.8h, v6.8h, \c
        add v7.8h, v7.8h, \d
.else
        ld1 {v4.8h}, [x2], x7
        ld1 {v5.8h}, [x2], x7
        sub v4.8h, v4.8h, \a
        ld1 {v6.8h}, [x2], x7
        sub v5.8h, v5.8h, \b
        ld1 {v7.8h}, [x2], x7
        sub v6.8h, v6.8h, \c
        sub v7.8h, v7.8h, \d
.endif
        ld1 {v0.8b}, [x0], x1
        ld1 {v1.8b}, [x0], x1
        srshr v4.8h, v4.8h, #6
        ld1 {v2.8b}, [x0], x1
        srshr v5.8h, v5.8h, #6
        uaddw v4.8h, v4.8h, v0.8b
        ld1 {v3.8b}, [x0], x1
        srshr v6.8h, v6.8h, #6
        uaddw v5.8h, v5.8h, v1.8b
        srshr v7.8h, v7.8h, #6
        sub x0, x0, x1, lsl #2
        uaddw v6.8h, v6.8h, v2.8b
        sqxtun v4.8b, v4.8h
        uaddw v7.8h, v7.8h, v3.8b
        sqxtun v5.8b, v5.8h
        st1 {v4.8b}, [x0], x1
        sqxtun v6.8b, v6.8h
        st1 {v5.8b}, [x0], x1
        sqxtun v7.8b, v7.8h
        st1 {v6.8b}, [x0], x1
        st1 {v7.8b}, [x0], x1
.endm
        load_acc_store v31.8h, v30.8h, v29.8h, v28.8h
        load_acc_store v27.8h, v26.8h, v25.8h, v24.8h
        load_acc_store v23.8h, v22.8h, v21.8h, v20.8h
        load_acc_store v19.8h, v18.8h, v17.8h, v16.8h
        sub x2, x2, x9
        load_acc_store v16.8h, v17.8h, v18.8h, v19.8h, 1
        load_acc_store v20.8h, v21.8h, v22.8h, v23.8h, 1
        load_acc_store v24.8h, v25.8h, v26.8h, v27.8h, 1
        load_acc_store v28.8h, v29.8h, v30.8h, v31.8h, 1
.purgem load_acc_store
        br x14
endfunc

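// eob thresholds for the 32x32 idct: if the eob is at or below the entry
// for an 8-column input slice, that slice (and the ones after it) contains
// no nonzero coefficients and its first pass can be skipped.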
const min_eob_idct_idct_32, align=4
        .short 0, 34, 135, 336
endconst

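// Full 32x32 idct/idct: two passes over the coefficients, processing four
// 8x32 slices per pass, with a 2048 byte temp buffer on the stack between
// the passes.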
function ff_vp9_idct_idct_32x32_add_neon, export=1
        cmp w3, #1
        b.eq idct32x32_dc_add_neon

        movrel x10, idct_coeffs
        add x11, x10, #32
        movrel x12, min_eob_idct_idct_32, 2

        mov x15, x30

        stp d14, d15, [sp, #-0x10]!
        stp d12, d13, [sp, #-0x10]!
        stp d10, d11, [sp, #-0x10]!
        stp d8, d9, [sp, #-0x10]!

        sub sp, sp, #2048

        mov x4, x0
        mov x5, x1
        mov x6, x2

        // Double stride of the input, since we only read every other line
        mov x9, #128
        neg x7, x9

.irp i, 0, 8, 16, 24
        add x0, sp, #(\i*64)
.if \i > 0
        ldrh w1, [x12], #2
        cmp w3, w1
        mov x1, #(32 - \i)/4
        b.le 1f
.endif
        add x2, x6, #(\i*2)
        bl idct32_1d_8x32_pass1_neon
.endr
        b 3f

1:
        // Write zeros to the temp buffer for pass 2
        movi v16.8h, #0
        movi v17.8h, #0
        movi v18.8h, #0
        movi v19.8h, #0
2:
        subs x1, x1, #1
.rept 4
        st1 {v16.8h-v19.8h}, [x0], #64
.endr
        b.ne 2b
3:
.irp i, 0, 8, 16, 24
        add x0, x4, #(\i)
        mov x1, x5
        add x2, sp, #(\i*2)
        bl idct32_1d_8x32_pass2_neon
.endr

        add sp, sp, #2048

        ldp d8, d9, [sp], 0x10
        ldp d10, d11, [sp], 0x10
        ldp d12, d13, [sp], 0x10
        ldp d14, d15, [sp], 0x10

        br x15
endfunc