aarch64: vp9itxfm: Do separate functions for half/quarter idct16 and idct32
libavcodec/aarch64/vp9itxfm_neon.S
1 /*
2 * Copyright (c) 2016 Google Inc.
3 *
4 * This file is part of Libav.
5 *
6 * Libav is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * Libav is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with Libav; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "libavutil/aarch64/asm.S"
22 #include "neon.S"
23
24 const itxfm4_coeffs, align=4
25 .short 11585, 6270, 15137, 0
26 iadst4_coeffs:
27 .short 5283, 15212, 9929, 13377
28 endconst
29
30 const iadst8_coeffs, align=4
31 .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
32 idct_coeffs:
33 .short 11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606
34 .short 16305, 12665, 10394, 7723, 14449, 15679, 4756, 0
35 .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
36 .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
37 endconst
38
39 const iadst16_coeffs, align=4
40 .short 16364, 804, 15893, 3981, 14811, 7005, 13160, 9760
41 .short 11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207
42 endconst
43
44 // out1 = ((in1 + in2) * v0.h[0] + (1 << 13)) >> 14
45 // out2 = ((in1 - in2) * v0.h[0] + (1 << 13)) >> 14
46 // in/out are .8h registers; this can be done with 4 temp registers, but is
47 // more efficient if 6 temp registers are available.
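// A rough C-style reference for what dmbutterfly0 computes per lane
// (illustrative sketch only, not assembled; in all uses in this file
// v0.h[0] holds 11585, i.e. cos(pi/4) in Q14):
//   static inline int16_t round14(int32_t x) { return (x + (1 << 13)) >> 14; }
//   out1 = round14((in1 + in2) * 11585);
//   out2 = round14((in1 - in2) * 11585);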
48 .macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
49 .if \neg > 0
50 neg \tmp4\().4h, v0.4h
51 .endif
52 add \tmp1\().8h, \in1\().8h, \in2\().8h
53 sub \tmp2\().8h, \in1\().8h, \in2\().8h
54 .if \neg > 0
55 smull \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
56 smull2 \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0]
57 .else
58 smull \tmp3\().4s, \tmp1\().4h, v0.h[0]
59 smull2 \tmp4\().4s, \tmp1\().8h, v0.h[0]
60 .endif
61 .ifb \tmp5
62 rshrn \out1\().4h, \tmp3\().4s, #14
63 rshrn2 \out1\().8h, \tmp4\().4s, #14
64 smull \tmp3\().4s, \tmp2\().4h, v0.h[0]
65 smull2 \tmp4\().4s, \tmp2\().8h, v0.h[0]
66 rshrn \out2\().4h, \tmp3\().4s, #14
67 rshrn2 \out2\().8h, \tmp4\().4s, #14
68 .else
69 smull \tmp5\().4s, \tmp2\().4h, v0.h[0]
70 smull2 \tmp6\().4s, \tmp2\().8h, v0.h[0]
71 rshrn \out1\().4h, \tmp3\().4s, #14
72 rshrn2 \out1\().8h, \tmp4\().4s, #14
73 rshrn \out2\().4h, \tmp5\().4s, #14
74 rshrn2 \out2\().8h, \tmp6\().4s, #14
75 .endif
76 .endm
77
78 // Same as dmbutterfly0 above, but treating the input in in2 as zero,
79 // writing the same output into both out1 and out2.
80 .macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
81 smull \tmp1\().4s, \in1\().4h, v0.h[0]
82 smull2 \tmp2\().4s, \in1\().8h, v0.h[0]
83 rshrn \out1\().4h, \tmp1\().4s, #14
84 rshrn2 \out1\().8h, \tmp2\().4s, #14
85 rshrn \out2\().4h, \tmp1\().4s, #14
86 rshrn2 \out2\().8h, \tmp2\().4s, #14
87 .endm
88
89 // out1,out2 = in1 * coef1 - in2 * coef2
90 // out3,out4 = in1 * coef2 + in2 * coef1
91 // out are 4 x .4s registers, in are 2 x .8h registers
92 .macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
93 smull \out1\().4s, \in1\().4h, \coef1
94 smull2 \out2\().4s, \in1\().8h, \coef1
95 smull \out3\().4s, \in1\().4h, \coef2
96 smull2 \out4\().4s, \in1\().8h, \coef2
97 smlsl \out1\().4s, \in2\().4h, \coef2
98 smlsl2 \out2\().4s, \in2\().8h, \coef2
99 smlal \out3\().4s, \in2\().4h, \coef1
100 smlal2 \out4\().4s, \in2\().8h, \coef1
101 .endm
102
103 // inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
104 // inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
105 // inout are 2 x .8h registers
106 .macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
107 dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
108 .if \neg > 0
109 neg \tmp3\().4s, \tmp3\().4s
110 neg \tmp4\().4s, \tmp4\().4s
111 .endif
112 rshrn \inout1\().4h, \tmp1\().4s, #14
113 rshrn2 \inout1\().8h, \tmp2\().4s, #14
114 rshrn \inout2\().4h, \tmp3\().4s, #14
115 rshrn2 \inout2\().8h, \tmp4\().4s, #14
116 .endm
117
118 // Same as dmbutterfly above, but treating the input in inout2 as zero
119 .macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
120 smull \tmp1\().4s, \inout1\().4h, \coef1
121 smull2 \tmp2\().4s, \inout1\().8h, \coef1
122 smull \tmp3\().4s, \inout1\().4h, \coef2
123 smull2 \tmp4\().4s, \inout1\().8h, \coef2
124 rshrn \inout1\().4h, \tmp1\().4s, #14
125 rshrn2 \inout1\().8h, \tmp2\().4s, #14
126 rshrn \inout2\().4h, \tmp3\().4s, #14
127 rshrn2 \inout2\().8h, \tmp4\().4s, #14
128 .endm
129
130 // Same as dmbutterfly above, but treating the input in inout1 as zero
131 .macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
132 smull \tmp1\().4s, \inout2\().4h, \coef2
133 smull2 \tmp2\().4s, \inout2\().8h, \coef2
134 smull \tmp3\().4s, \inout2\().4h, \coef1
135 smull2 \tmp4\().4s, \inout2\().8h, \coef1
136 neg \tmp1\().4s, \tmp1\().4s
137 neg \tmp2\().4s, \tmp2\().4s
138 rshrn \inout2\().4h, \tmp3\().4s, #14
139 rshrn2 \inout2\().8h, \tmp4\().4s, #14
140 rshrn \inout1\().4h, \tmp1\().4s, #14
141 rshrn2 \inout1\().8h, \tmp2\().4s, #14
142 .endm
143
144 .macro dsmull_h out1, out2, in, coef
145 smull \out1\().4s, \in\().4h, \coef
146 smull2 \out2\().4s, \in\().8h, \coef
147 .endm
148
149 .macro drshrn_h out, in1, in2, shift
150 rshrn \out\().4h, \in1\().4s, \shift
151 rshrn2 \out\().8h, \in2\().4s, \shift
152 .endm
153
154
155 // out1 = in1 + in2
156 // out2 = in1 - in2
157 .macro butterfly_8h out1, out2, in1, in2
158 add \out1\().8h, \in1\().8h, \in2\().8h
159 sub \out2\().8h, \in1\().8h, \in2\().8h
160 .endm
161
162 // out1 = in1 - in2
163 // out2 = in1 + in2
164 .macro butterfly_8h_r out1, out2, in1, in2
165 sub \out1\().8h, \in1\().8h, \in2\().8h
166 add \out2\().8h, \in1\().8h, \in2\().8h
167 .endm
168
169 // out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
170 // out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
171 // out are 2 x .8h registers, in are 4 x .4s registers
172 .macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
173 add \tmp1\().4s, \in1\().4s, \in3\().4s
174 add \tmp2\().4s, \in2\().4s, \in4\().4s
175 sub \tmp3\().4s, \in1\().4s, \in3\().4s
176 sub \tmp4\().4s, \in2\().4s, \in4\().4s
177 rshrn \out1\().4h, \tmp1\().4s, #14
178 rshrn2 \out1\().8h, \tmp2\().4s, #14
179 rshrn \out2\().4h, \tmp3\().4s, #14
180 rshrn2 \out2\().8h, \tmp4\().4s, #14
181 .endm
182
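// In-place 4-point inverse Walsh-Hadamard transform (the lossless path).
// Roughly, per lane, in C (illustrative sketch only):
//   c0 += c1;  t = c2 - c3;  u = (c0 - t) >> 1;
//   c2 = u - c1;  c1 = u - c3;  c3 = t + c2;  c0 -= c1;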
183 .macro iwht4 c0, c1, c2, c3
184 add \c0\().4h, \c0\().4h, \c1\().4h
185 sub v17.4h, \c2\().4h, \c3\().4h
186 sub v16.4h, \c0\().4h, v17.4h
187 sshr v16.4h, v16.4h, #1
188 sub \c2\().4h, v16.4h, \c1\().4h
189 sub \c1\().4h, v16.4h, \c3\().4h
190 add \c3\().4h, v17.4h, \c2\().4h
191 sub \c0\().4h, \c0\().4h, \c1\().4h
192 .endm
193
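// 4-point IDCT on .4h registers; v0.h[0-2] hold 11585, 6270, 15137
// (cos(pi/4), cos(3*pi/8) and cos(pi/8) in Q14). Illustrative C sketch only,
// with round14(x) = (x + (1 << 13)) >> 14:
//   t0 = round14((c0 + c2) * 11585);   t1 = round14((c0 - c2) * 11585);
//   t2 = round14(c1 * 6270 - c3 * 15137);
//   t3 = round14(c1 * 15137 + c3 * 6270);
//   c0 = t0 + t3;  c1 = t1 + t2;  c2 = t1 - t2;  c3 = t0 - t3;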
194 .macro idct4 c0, c1, c2, c3
195 smull v22.4s, \c1\().4h, v0.h[2]
196 smull v20.4s, \c1\().4h, v0.h[1]
197 add v16.4h, \c0\().4h, \c2\().4h
198 sub v17.4h, \c0\().4h, \c2\().4h
199 smlal v22.4s, \c3\().4h, v0.h[1]
200 smull v18.4s, v16.4h, v0.h[0]
201 smull v19.4s, v17.4h, v0.h[0]
202 smlsl v20.4s, \c3\().4h, v0.h[2]
203 rshrn v22.4h, v22.4s, #14
204 rshrn v18.4h, v18.4s, #14
205 rshrn v19.4h, v19.4s, #14
206 rshrn v20.4h, v20.4s, #14
207 add \c0\().4h, v18.4h, v22.4h
208 sub \c3\().4h, v18.4h, v22.4h
209 add \c1\().4h, v19.4h, v20.4h
210 sub \c2\().4h, v19.4h, v20.4h
211 .endm
212
213 .macro iadst4 c0, c1, c2, c3
214 smull v16.4s, \c0\().4h, v0.h[4]
215 smlal v16.4s, \c2\().4h, v0.h[5]
216 smlal v16.4s, \c3\().4h, v0.h[6]
217 smull v17.4s, \c0\().4h, v0.h[6]
218 smlsl v17.4s, \c2\().4h, v0.h[4]
219 sub \c0\().4h, \c0\().4h, \c2\().4h
220 smlsl v17.4s, \c3\().4h, v0.h[5]
221 add \c0\().4h, \c0\().4h, \c3\().4h
222 smull v19.4s, \c1\().4h, v0.h[7]
223 smull v18.4s, \c0\().4h, v0.h[7]
224 add v20.4s, v16.4s, v19.4s
225 add v21.4s, v17.4s, v19.4s
226 rshrn \c0\().4h, v20.4s, #14
227 add v16.4s, v16.4s, v17.4s
228 rshrn \c1\().4h, v21.4s, #14
229 sub v16.4s, v16.4s, v19.4s
230 rshrn \c2\().4h, v18.4s, #14
231 rshrn \c3\().4h, v16.4s, #14
232 .endm
233
234 // The public functions in this file have the following signature:
235 // void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
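// dst/stride/block/eob arrive in x0/x1/x2/w3 per the AAPCS64; eob is the
// number of nonzero coefficients in scan order, and the nonzero coefficients
// in block are cleared before returning.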
236
237 .macro itxfm_func4x4 txfm1, txfm2
238 function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
239 .ifc \txfm1,\txfm2
240 .ifc \txfm1,idct
241 movrel x4, itxfm4_coeffs
242 ld1 {v0.4h}, [x4]
243 .endif
244 .ifc \txfm1,iadst
245 movrel x4, iadst4_coeffs
246 ld1 {v0.d}[1], [x4]
247 .endif
248 .else
249 movrel x4, itxfm4_coeffs
250 ld1 {v0.8h}, [x4]
251 .endif
252
253 movi v31.8h, #0
254 .ifc \txfm1\()_\txfm2,idct_idct
255 cmp w3, #1
256 b.ne 1f
257 // DC-only for idct/idct
258 ld1r {v2.4h}, [x2]
259 smull v2.4s, v2.4h, v0.h[0]
260 rshrn v2.4h, v2.4s, #14
261 smull v2.4s, v2.4h, v0.h[0]
262 rshrn v2.4h, v2.4s, #14
263 st1 {v31.h}[0], [x2]
264 dup v4.4h, v2.h[0]
265 mov v5.16b, v4.16b
266 mov v6.16b, v4.16b
267 mov v7.16b, v4.16b
268 b 2f
269 .endif
270
271 1:
272 ld1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x2]
273 st1 {v31.8h}, [x2], #16
274
275 .ifc \txfm1,iwht
276 sshr v4.4h, v4.4h, #2
277 sshr v5.4h, v5.4h, #2
278 sshr v6.4h, v6.4h, #2
279 sshr v7.4h, v7.4h, #2
280 .endif
281
282 \txfm1\()4 v4, v5, v6, v7
283
284 st1 {v31.8h}, [x2], #16
285 // Transpose 4x4 with 16 bit elements
286 transpose_4x4H v4, v5, v6, v7, v16, v17, v18, v19
287
288 \txfm2\()4 v4, v5, v6, v7
289 2:
290 ld1r {v0.2s}, [x0], x1
291 ld1r {v1.2s}, [x0], x1
292 .ifnc \txfm1,iwht
293 srshr v4.4h, v4.4h, #4
294 srshr v5.4h, v5.4h, #4
295 srshr v6.4h, v6.4h, #4
296 srshr v7.4h, v7.4h, #4
297 .endif
298 uaddw v4.8h, v4.8h, v0.8b
299 uaddw v5.8h, v5.8h, v1.8b
300 ld1r {v2.2s}, [x0], x1
301 ld1r {v3.2s}, [x0], x1
302 sqxtun v0.8b, v4.8h
303 sqxtun v1.8b, v5.8h
304 sub x0, x0, x1, lsl #2
305
306 uaddw v6.8h, v6.8h, v2.8b
307 uaddw v7.8h, v7.8h, v3.8b
308 st1 {v0.s}[0], [x0], x1
309 sqxtun v2.8b, v6.8h
310 sqxtun v3.8b, v7.8h
311
312 st1 {v1.s}[0], [x0], x1
313 st1 {v2.s}[0], [x0], x1
314 st1 {v3.s}[0], [x0], x1
315
316 ret
317 endfunc
318 .endm
319
320 itxfm_func4x4 idct, idct
321 itxfm_func4x4 iadst, idct
322 itxfm_func4x4 idct, iadst
323 itxfm_func4x4 iadst, iadst
324 itxfm_func4x4 iwht, iwht
325
326
327 .macro idct8
328 dmbutterfly0 v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
329 dmbutterfly v18, v22, v0.h[1], v0.h[2], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
330 dmbutterfly v17, v23, v0.h[3], v0.h[4], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
331 dmbutterfly v21, v19, v0.h[5], v0.h[6], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
332
333 butterfly_8h v24, v25, v16, v22 // v24 = t0, v25 = t3
334 butterfly_8h v28, v29, v17, v21 // v28 = t4, v29 = t5a
335 butterfly_8h v30, v31, v23, v19 // v30 = t7, v31 = t6a
336 butterfly_8h v26, v27, v20, v18 // v26 = t1, v27 = t2
337
338 dmbutterfly0 v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5
339
340 butterfly_8h v16, v23, v24, v30 // v16 = out[0], v23 = out[7]
341 butterfly_8h v17, v22, v26, v31 // v17 = out[1], v22 = out[6]
342 butterfly_8h v18, v21, v27, v29 // v18 = out[2], v21 = out[5]
343 butterfly_8h v19, v20, v25, v28 // v19 = out[3], v20 = out[4]
344 .endm
345
346 .macro iadst8
347 dmbutterfly_l v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0] // v24,v25 = t1a, v26,v27 = t0a
348 dmbutterfly_l v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2] // v28,v29 = t3a, v30,v31 = t2a
349 dmbutterfly_l v2, v3, v4, v5, v19, v20, v1.h[5], v1.h[4] // v2,v3 = t5a, v4,v5 = t4a
350 dmbutterfly_l v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6] // v16,v18 = t7a, v21,v23 = t6a
351
352 dbutterfly_n v4, v5, v26, v27, v4, v5, v6, v7, v26, v27 // v4 = t0, v5 = t4
353 dbutterfly_n v2, v3, v24, v25, v2, v3, v6, v7, v26, v27 // v2 = t1, v3 = t5
354 dbutterfly_n v24, v25, v30, v31, v21, v23, v6, v7, v26, v27 // v24 = t2, v25 = t6
355 dbutterfly_n v30, v31, v28, v29, v16, v18, v6, v7, v26, v27 // v30 = t3, v31 = t7
356
357 butterfly_8h v16, v6, v4, v24 // v16 = out[0], v6 = t2
358 butterfly_8h v23, v7, v2, v30 // v23 = -out[7], v7 = t3
359 neg v23.8h, v23.8h // v23 = out[7]
360
361 dmbutterfly0 v19, v20, v6, v7, v24, v26, v27, v28, v29, v30 // v19 = -out[3], v20 = out[4]
362 neg v19.8h, v19.8h // v19 = out[3]
363
364 dmbutterfly_l v26, v27, v28, v29, v5, v3, v0.h[1], v0.h[2] // v26,v27 = t5a, v28,v29 = t4a
365 dmbutterfly_l v2, v3, v4, v5, v31, v25, v0.h[2], v0.h[1] // v2,v3 = t6a, v4,v5 = t7a
366
367 dbutterfly_n v17, v30, v28, v29, v2, v3, v6, v7, v24, v25 // v17 = -out[1], v30 = t6
368 dbutterfly_n v22, v31, v26, v27, v4, v5, v6, v7, v24, v25 // v22 = out[6], v31 = t7
369 neg v17.8h, v17.8h // v17 = out[1]
370
371 dmbutterfly0 v18, v21, v30, v31, v2, v3, v4, v5, v6, v7 // v18 = out[2], v21 = -out[5]
372 neg v21.8h, v21.8h // v21 = out[5]
373 .endm
374
375
376 .macro itxfm_func8x8 txfm1, txfm2
377 function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
378 // The iadst also uses a few coefficients from
379 // idct, so those always need to be loaded.
380 .ifc \txfm1\()_\txfm2,idct_idct
381 movrel x4, idct_coeffs
382 ld1 {v0.8h}, [x4]
383 .else
384 movrel x4, iadst8_coeffs
385 ld1 {v1.8h}, [x4], #16
386 ld1 {v0.8h}, [x4]
387 .endif
388
389 movi v2.16b, #0
390 movi v3.16b, #0
391 movi v4.16b, #0
392 movi v5.16b, #0
393
394 .ifc \txfm1\()_\txfm2,idct_idct
395 cmp w3, #1
396 b.ne 1f
397 // DC-only for idct/idct
398 ld1r {v2.4h}, [x2]
399 smull v2.4s, v2.4h, v0.h[0]
400 rshrn v2.4h, v2.4s, #14
401 smull v2.4s, v2.4h, v0.h[0]
402 rshrn v2.4h, v2.4s, #14
403 st1 {v3.h}[0], [x2]
404 dup v16.8h, v2.h[0]
405 mov v17.16b, v16.16b
406 mov v18.16b, v16.16b
407 mov v19.16b, v16.16b
408 mov v20.16b, v16.16b
409 mov v21.16b, v16.16b
410 mov v22.16b, v16.16b
411 mov v23.16b, v16.16b
412 b 2f
413 .endif
414 1:
415 ld1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x2], #64
416 ld1 {v20.16b,v21.16b,v22.16b,v23.16b}, [x2], #64
417 sub x2, x2, #128
418 st1 {v2.16b,v3.16b,v4.16b,v5.16b}, [x2], #64
419 st1 {v2.16b,v3.16b,v4.16b,v5.16b}, [x2], #64
420
421 \txfm1\()8
422
423 // Transpose 8x8 with 16 bit elements
424 transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
425
426 \txfm2\()8
427 2:
428 mov x3, x0
429 // Add into the destination
430 ld1 {v0.8b}, [x0], x1
431 srshr v16.8h, v16.8h, #5
432 ld1 {v1.8b}, [x0], x1
433 srshr v17.8h, v17.8h, #5
434 ld1 {v2.8b}, [x0], x1
435 srshr v18.8h, v18.8h, #5
436 uaddw v16.8h, v16.8h, v0.8b
437 ld1 {v3.8b}, [x0], x1
438 srshr v19.8h, v19.8h, #5
439 uaddw v17.8h, v17.8h, v1.8b
440 ld1 {v4.8b}, [x0], x1
441 srshr v20.8h, v20.8h, #5
442 uaddw v18.8h, v18.8h, v2.8b
443 sqxtun v0.8b, v16.8h
444 ld1 {v5.8b}, [x0], x1
445 srshr v21.8h, v21.8h, #5
446 uaddw v19.8h, v19.8h, v3.8b
447 sqxtun v1.8b, v17.8h
448 ld1 {v6.8b}, [x0], x1
449 srshr v22.8h, v22.8h, #5
450 uaddw v20.8h, v20.8h, v4.8b
451 sqxtun v2.8b, v18.8h
452 ld1 {v7.8b}, [x0], x1
453 srshr v23.8h, v23.8h, #5
454 uaddw v21.8h, v21.8h, v5.8b
455 sqxtun v3.8b, v19.8h
456
457 st1 {v0.8b}, [x3], x1
458 uaddw v22.8h, v22.8h, v6.8b
459 st1 {v1.8b}, [x3], x1
460 sqxtun v4.8b, v20.8h
461 st1 {v2.8b}, [x3], x1
462 uaddw v23.8h, v23.8h, v7.8b
463 st1 {v3.8b}, [x3], x1
464 sqxtun v5.8b, v21.8h
465 st1 {v4.8b}, [x3], x1
466 sqxtun v6.8b, v22.8h
467 st1 {v5.8b}, [x3], x1
468 sqxtun v7.8b, v23.8h
469
470 st1 {v6.8b}, [x3], x1
471 st1 {v7.8b}, [x3], x1
472
473 ret
474 endfunc
475 .endm
476
477 itxfm_func8x8 idct, idct
478 itxfm_func8x8 iadst, idct
479 itxfm_func8x8 idct, iadst
480 itxfm_func8x8 iadst, iadst
481
482
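// Add a DC-only inverse transform to a 16x16 block. Roughly, in C
// (illustrative sketch only, with round14 as above):
//   dc = round14(round14(block[0] * 11585) * 11585); // row + column pass on DC
//   dc = (dc + (1 << 5)) >> 6;
//   for all 16x16 pixels: dst[x] = av_clip_uint8(dst[x] + dc);
// block[0] is cleared afterwards.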
483 function idct16x16_dc_add_neon
484 movrel x4, idct_coeffs
485 ld1 {v0.4h}, [x4]
486
487 movi v1.4h, #0
488
489 ld1r {v2.4h}, [x2]
490 smull v2.4s, v2.4h, v0.h[0]
491 rshrn v2.4h, v2.4s, #14
492 smull v2.4s, v2.4h, v0.h[0]
493 rshrn v2.4h, v2.4s, #14
494 dup v2.8h, v2.h[0]
495 st1 {v1.h}[0], [x2]
496
497 srshr v2.8h, v2.8h, #6
498
499 mov x4, #16
500 1:
501 // Loop to add the constant from v2 into all 16x16 outputs
502 ld1 {v3.16b}, [x0]
503 uaddw v4.8h, v2.8h, v3.8b
504 uaddw2 v5.8h, v2.8h, v3.16b
505 sqxtun v4.8b, v4.8h
506 sqxtun2 v4.16b, v5.8h
507 st1 {v4.16b}, [x0], x1
508 subs x4, x4, #1
509 b.ne 1b
510
511 ret
512 endfunc
513
514 .macro idct16_end
515 butterfly_8h v18, v7, v4, v7 // v18 = t0a, v7 = t7a
516 butterfly_8h v19, v22, v5, v22 // v19 = t1a, v22 = t6
517 butterfly_8h v4, v26, v20, v26 // v4 = t2a, v26 = t5
518 butterfly_8h v5, v6, v28, v6 // v5 = t3a, v6 = t4
519 butterfly_8h v20, v28, v16, v24 // v20 = t8a, v28 = t11a
520 butterfly_8h v24, v21, v23, v21 // v24 = t9, v21 = t10
521 butterfly_8h v23, v27, v25, v27 // v23 = t14, v27 = t13
522 butterfly_8h v25, v29, v29, v17 // v25 = t15a, v29 = t12a
523
524 dmbutterfly0 v2, v3, v27, v21, v2, v3, v16, v17, v30, v31 // v2 = t13a, v3 = t10a
525 dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11
526
527 butterfly_8h v16, v31, v18, v25 // v16 = out[0], v31 = out[15]
528 butterfly_8h v17, v30, v19, v23 // v17 = out[1], v30 = out[14]
529 butterfly_8h_r v25, v22, v22, v24 // v25 = out[9], v22 = out[6]
530 butterfly_8h v23, v24, v7, v20 // v23 = out[7], v24 = out[8]
531 butterfly_8h v18, v29, v4, v2 // v18 = out[2], v29 = out[13]
532 butterfly_8h v19, v28, v5, v28 // v19 = out[3], v28 = out[12]
533 butterfly_8h v20, v27, v6, v27 // v20 = out[4], v27 = out[11]
534 butterfly_8h v21, v26, v26, v3 // v21 = out[5], v26 = out[10]
535 ret
536 .endm
537
538 function idct16
539 dmbutterfly0 v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
540 dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
541 dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
542 dmbutterfly v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
543 dmbutterfly v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
544 dmbutterfly v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
545 dmbutterfly v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
546 dmbutterfly v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
547
548 butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3
549 butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2
550 butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5
551 butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6
552 butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9
553 butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10
554 butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13
555 butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14
556
557 dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
558 dmbutterfly v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
559 dmbutterfly v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
560 idct16_end
561 endfunc
562
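// Same as idct16 above, but assuming that inputs 8-15 (v24-v31) are zero;
// the dmbutterfly*_h1/_h2 variants skip the multiplications involving the
// zero operand, and v24-v31 don't need to be loaded at all.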
563 function idct16_half
564 dmbutterfly0_h v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
565 dmbutterfly_h1 v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
566 dmbutterfly_h1 v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
567 dmbutterfly_h2 v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
568 dmbutterfly_h1 v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
569 dmbutterfly_h2 v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
570 dmbutterfly_h1 v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
571 dmbutterfly_h2 v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
572
573 butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3
574 butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2
575 butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5
576 butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6
577 butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9
578 butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10
579 butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13
580 butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14
581
582 dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
583 dmbutterfly v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
584 dmbutterfly v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
585 idct16_end
586 endfunc
587
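// Same as idct16 above, but assuming that only inputs 0-3 (v16-v19) are
// nonzero; the first stage then reduces to plain multiplications of those
// four inputs.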
588 function idct16_quarter
589 dsmull_h v24, v25, v19, v1.h[6]
590 dsmull_h v4, v5, v17, v0.h[7]
591 dsmull_h v7, v6, v18, v0.h[4]
592 dsmull_h v30, v31, v18, v0.h[3]
593 neg v24.4s, v24.4s
594 neg v25.4s, v25.4s
595 dsmull_h v29, v28, v17, v1.h[0]
596 dsmull_h v26, v27, v19, v1.h[5]
597 dsmull_h v22, v23, v16, v0.h[0]
598 drshrn_h v24, v24, v25, #14
599 drshrn_h v16, v4, v5, #14
600 drshrn_h v7, v7, v6, #14
601 drshrn_h v6, v30, v31, #14
602 drshrn_h v29, v29, v28, #14
603 drshrn_h v17, v26, v27, #14
604 drshrn_h v28, v22, v23, #14
605
606 dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.h[1], v0.h[2]
607 dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.h[1], v0.h[2]
608 neg v22.4s, v22.4s
609 neg v23.4s, v23.4s
610 drshrn_h v27, v20, v21, #14
611 drshrn_h v21, v22, v23, #14
612 drshrn_h v23, v18, v19, #14
613 drshrn_h v25, v30, v31, #14
614 mov v4.16b, v28.16b
615 mov v5.16b, v28.16b
616 dmbutterfly0 v22, v26, v7, v6, v18, v19, v30, v31
617 mov v20.16b, v28.16b
618 idct16_end
619 endfunc
620
621 function iadst16
622 ld1 {v0.8h,v1.8h}, [x11]
623
624 dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.h[1], v0.h[0] // v6,v7 = t1, v4,v5 = t0
625 dmbutterfly_l v10, v11, v8, v9, v23, v24, v1.h[1], v1.h[0] // v10,v11 = t9, v8,v9 = t8
626 dbutterfly_n v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a
627 dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2] // v14,v15 = t3, v12,v13 = t2
628 dbutterfly_n v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a
629
630 dmbutterfly_l v6, v7, v4, v5, v21, v26, v1.h[3], v1.h[2] // v6,v7 = t11, v4,v5 = t10
631 dbutterfly_n v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a
632 dmbutterfly_l v10, v11, v8, v9, v27, v20, v0.h[5], v0.h[4] // v10,v11 = t5, v8,v9 = t4
633 dbutterfly_n v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a
634
635 dmbutterfly_l v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4] // v14,v15 = t13, v12,v13 = t12
636 dbutterfly_n v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a
637 dmbutterfly_l v6, v7, v4, v5, v25, v22, v0.h[7], v0.h[6] // v6,v7 = t7, v4,v5 = t6
638 dbutterfly_n v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a
639
640 dmbutterfly_l v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6] // v10,v11 = t15, v8,v9 = t14
641 ld1 {v0.8h}, [x10]
642 dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a
643 dmbutterfly_l v14, v15, v12, v13, v23, v24, v0.h[3], v0.h[4] // v14,v15 = t9, v12,v13 = t8
644 dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a
645
646 dmbutterfly_l v4, v5, v6, v7, v28, v19, v0.h[4], v0.h[3] // v4,v5 = t12, v6,v7 = t13
647 dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a
648 dmbutterfly_l v10, v11, v8, v9, v21, v26, v0.h[5], v0.h[6] // v10,v11 = t11, v8,v9 = t10
649 butterfly_8h_r v4, v27, v16, v27 // v4 = t4, v27 = t0
650 dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a
651
652 dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[6], v0.h[5] // v12,v13 = t14, v14,v15 = t15
653 butterfly_8h_r v5, v20, v31, v20 // v5 = t5, v20 = t1
654 dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a
655 dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a
656
657 butterfly_8h_r v6, v25, v18, v25 // v6 = t6, v25 = t2
658 butterfly_8h_r v7, v22, v29, v22 // v7 = t7, v22 = t3
659
660 dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.h[1], v0.h[2] // v10,v11 = t13, v8,v9 = t12
661 dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[2], v0.h[1] // v12,v13 = t14, v14,v15 = t15
662
663 dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a
664 dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a
665 neg v29.8h, v29.8h // v29 = out[13]
666
667 dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.h[1], v0.h[2] // v10,v11 = t5a, v8,v9 = t4a
668 dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.h[2], v0.h[1] // v12,v13 = t6a, v14,v15 = t7a
669
670 butterfly_8h v2, v6, v27, v25 // v2 = out[0], v6 = t2a
671 butterfly_8h v3, v7, v23, v21 // v3 = -out[1], v7 = t10
672
673 dbutterfly_n v19, v31, v8, v9, v12, v13, v4, v5, v8, v9 // v19 = -out[3], v31 = t6
674 neg v19.8h, v19.8h // v19 = out[3]
675 dbutterfly_n v28, v16, v10, v11, v14, v15, v4, v5, v10, v11 // v28 = out[12], v16 = t7
676
677 butterfly_8h v5, v8, v20, v22 // v5 = -out[15], v8 = t3a
678 butterfly_8h v4, v9, v24, v26 // v4 = out[14], v9 = t11
679
680 dmbutterfly0 v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
681 dmbutterfly0 v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
682 dmbutterfly0 v20, v27, v16, v31, v10, v11, v12, v13, v14, v15 // v20 = out[4], v27 = out[11]
683 dmbutterfly0 v22, v25, v9, v7, v10, v11, v12, v13, v14, v15 // v22 = out[6], v25 = out[9]
684
685 neg v31.8h, v5.8h // v31 = out[15]
686 neg v17.8h, v3.8h // v17 = out[1]
687
688 mov v16.16b, v2.16b
689 mov v30.16b, v4.16b
690 ret
691 endfunc
692
693 // Helper macros; we can't use these expressions directly within
694 // e.g. .irp due to the extra concatenation \(). Therefore wrap
695 // them in macros to allow using .irp below.
696 .macro load i, src, inc
697 ld1 {v\i\().8h}, [\src], \inc
698 .endm
699 .macro store i, dst, inc
700 st1 {v\i\().8h}, [\dst], \inc
701 .endm
702 .macro movi_v i, size, imm
703 movi v\i\()\size, \imm
704 .endm
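// Load a row and clear the memory it was loaded from (v2 is assumed to hold
// zeros), so that the coefficient buffer is zeroed as a side effect of pass 1.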
705 .macro load_clear i, src, inc
706 ld1 {v\i\().8h}, [\src]
707 st1 {v2.8h}, [\src], \inc
708 .endm
709
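// Shift eight rows of transformed coefficients right by 6 bits with rounding
// (srshr #6), add them to the corresponding destination rows (loaded
// alternately through x0 and x3), clip to 8 bit and store the result back.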
710 .macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
711 srshr \coef0, \coef0, #6
712 ld1 {v2.8b}, [x0], x1
713 srshr \coef1, \coef1, #6
714 ld1 {v3.8b}, [x3], x1
715 srshr \coef2, \coef2, #6
716 ld1 {v4.8b}, [x0], x1
717 srshr \coef3, \coef3, #6
718 uaddw \coef0, \coef0, v2.8b
719 ld1 {v5.8b}, [x3], x1
720 uaddw \coef1, \coef1, v3.8b
721 srshr \coef4, \coef4, #6
722 ld1 {v6.8b}, [x0], x1
723 srshr \coef5, \coef5, #6
724 ld1 {v7.8b}, [x3], x1
725 sqxtun v2.8b, \coef0
726 srshr \coef6, \coef6, #6
727 sqxtun v3.8b, \coef1
728 srshr \coef7, \coef7, #6
729 uaddw \coef2, \coef2, v4.8b
730 ld1 {\tmp1}, [x0], x1
731 uaddw \coef3, \coef3, v5.8b
732 ld1 {\tmp2}, [x3], x1
733 sqxtun v4.8b, \coef2
734 sub x0, x0, x1, lsl #2
735 sub x3, x3, x1, lsl #2
736 sqxtun v5.8b, \coef3
737 uaddw \coef4, \coef4, v6.8b
738 st1 {v2.8b}, [x0], x1
739 uaddw \coef5, \coef5, v7.8b
740 st1 {v3.8b}, [x3], x1
741 sqxtun v6.8b, \coef4
742 st1 {v4.8b}, [x0], x1
743 sqxtun v7.8b, \coef5
744 st1 {v5.8b}, [x3], x1
745 uaddw \coef6, \coef6, \tmp1
746 st1 {v6.8b}, [x0], x1
747 uaddw \coef7, \coef7, \tmp2
748 st1 {v7.8b}, [x3], x1
749 sqxtun \tmp1, \coef6
750 sqxtun \tmp2, \coef7
751 st1 {\tmp1}, [x0], x1
752 st1 {\tmp2}, [x3], x1
753 .endm
754
755 // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
756 // transpose into a horizontal 16x8 slice and store.
757 // x0 = dst (temp buffer)
758 // x1 = slice offset
759 // x2 = src
760 // x9 = input stride
761 .macro itxfm16_1d_funcs txfm
762 function \txfm\()16_1d_8x16_pass1_neon
763 mov x14, x30
764
765 movi v2.8h, #0
766 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
767 load_clear \i, x2, x9
768 .endr
769
770 bl \txfm\()16
771
772 // Do two 8x8 transposes. Originally, v16-v31 contain the
773 // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
774 // transposed 8x8 blocks.
775 transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
776 transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
777
778 // Store the transposed 8x8 blocks horizontally.
779 cmp x1, #8
780 b.eq 1f
781 .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
782 store \i, x0, #16
783 .endr
784 br x14
785 1:
786 // Special case: For the last input column (x1 == 8),
787 // which would be stored as the last row in the temp buffer,
788 // don't store the first 8x8 block, but keep it in registers
789 // for the first slice of the second pass (where it is the
790 // last 8x8 block).
791 .irp i, 24, 25, 26, 27, 28, 29, 30, 31
792 add x0, x0, #16
793 store \i, x0, #16
794 .endr
795 mov v24.16b, v16.16b
796 mov v25.16b, v17.16b
797 mov v26.16b, v18.16b
798 mov v27.16b, v19.16b
799 mov v28.16b, v20.16b
800 mov v29.16b, v21.16b
801 mov v30.16b, v22.16b
802 mov v31.16b, v23.16b
803 br x14
804 endfunc
805
806 // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
807 // load the destination pixels (from a similar 8x16 slice), add and store back.
808 // x0 = dst
809 // x1 = dst stride
810 // x2 = src (temp buffer)
811 // x3 = slice offset
812 // x9 = temp buffer stride
813 function \txfm\()16_1d_8x16_pass2_neon
814 mov x14, x30
815 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
816 load \i, x2, x9
817 .endr
818 cbz x3, 1f
819 .irp i, 24, 25, 26, 27, 28, 29, 30, 31
820 load \i, x2, x9
821 .endr
822 1:
823
824 add x3, x0, x1
825 lsl x1, x1, #1
826 bl \txfm\()16
827
828 load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
829 load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
830
831 br x14
832 endfunc
833 .endm
834
835 itxfm16_1d_funcs idct
836 itxfm16_1d_funcs iadst
837
838 .macro itxfm_func16x16 txfm1, txfm2
839 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
840 .ifc \txfm1\()_\txfm2,idct_idct
841 cmp w3, #1
842 b.eq idct16x16_dc_add_neon
843 .endif
844 mov x15, x30
845 // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
846 .ifnc \txfm1\()_\txfm2,idct_idct
847 stp d14, d15, [sp, #-0x10]!
848 stp d12, d13, [sp, #-0x10]!
849 stp d10, d11, [sp, #-0x10]!
850 stp d8, d9, [sp, #-0x10]!
851 .endif
852
853 sub sp, sp, #512
854
855 mov x4, x0
856 mov x5, x1
857 mov x6, x2
858
859 movrel x10, idct_coeffs
860 .ifnc \txfm1\()_\txfm2,idct_idct
861 movrel x11, iadst16_coeffs
862 .endif
863 .ifc \txfm1,idct
864 ld1 {v0.8h,v1.8h}, [x10]
865 .endif
866 mov x9, #32
867
868 .ifc \txfm1\()_\txfm2,idct_idct
869 cmp w3, #10
870 b.le idct16x16_quarter_add_neon
871 cmp w3, #38
872 b.le idct16x16_half_add_neon
873 .endif
874
875 .irp i, 0, 8
876 add x0, sp, #(\i*32)
877 .ifc \txfm1\()_\txfm2,idct_idct
878 .if \i == 8
879 cmp w3, #38
880 b.le 1f
881 .endif
882 .endif
883 mov x1, #\i
884 add x2, x6, #(\i*2)
885 bl \txfm1\()16_1d_8x16_pass1_neon
886 .endr
887 .ifc \txfm1\()_\txfm2,iadst_idct
888 ld1 {v0.8h,v1.8h}, [x10]
889 .endif
890
891 .ifc \txfm1\()_\txfm2,idct_idct
892 b 3f
893 1:
894 // Set v24-v31 to zero, for the in-register passthrough of
895 // coefficients to pass 2. Since we only do two slices, this can
896 // only ever happen for the second slice. So we only need to store
897 // zeros to the temp buffer for the second half of the buffer.
898 // Move x0 to the second half, and use x9 == 32 as increment.
899 add x0, x0, #16
900 .irp i, 24, 25, 26, 27, 28, 29, 30, 31
901 movi_v \i, .16b, #0
902 st1 {v24.8h}, [x0], x9
903 .endr
904 3:
905 .endif
906
907 .irp i, 0, 8
908 add x0, x4, #(\i)
909 mov x1, x5
910 add x2, sp, #(\i*2)
911 mov x3, #\i
912 bl \txfm2\()16_1d_8x16_pass2_neon
913 .endr
914
915 add sp, sp, #512
916 .ifnc \txfm1\()_\txfm2,idct_idct
917 ldp d8, d9, [sp], 0x10
918 ldp d10, d11, [sp], 0x10
919 ldp d12, d13, [sp], 0x10
920 ldp d14, d15, [sp], 0x10
921 .endif
922 br x15
923 endfunc
924 .endm
925
926 itxfm_func16x16 idct, idct
927 itxfm_func16x16 iadst, idct
928 itxfm_func16x16 idct, iadst
929 itxfm_func16x16 iadst, iadst
930
931 function idct16_1d_8x16_pass1_quarter_neon
932 mov x14, x30
933 movi v2.8h, #0
934 .irp i, 16, 17, 18, 19
935 load_clear \i, x2, x9
936 .endr
937
938 bl idct16_quarter
939
940 // Do two 8x8 transposes. Originally, v16-v31 contain the
941 // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
942 // transposed 8x8 blocks.
943 transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
944 transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
945
946 // Store the transposed 8x8 blocks horizontally.
947 // The first 8x8 block is kept in registers for the second pass,
948 // store the rest in the temp buffer.
949 // Since only a 4x4 part of the input was nonzero, this means that
950 // only 4 rows are nonzero after transposing, and the second pass
951 // only reads the topmost 4 rows. Therefore only store the topmost
952 // 4 rows.
953 add x0, x0, #16
954 .irp i, 24, 25, 26, 27
955 store \i, x0, x9
956 .endr
957 br x14
958 endfunc
959
960 function idct16_1d_8x16_pass2_quarter_neon
961 mov x14, x30
962 cbz x3, 1f
963 .irp i, 16, 17, 18, 19
964 load \i, x2, x9
965 .endr
966 1:
967
968 add x3, x0, x1
969 lsl x1, x1, #1
970 bl idct16_quarter
971
972 load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
973 load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
974
975 br x14
976 endfunc
977
978 function idct16_1d_8x16_pass1_half_neon
979 mov x14, x30
980 movi v2.8h, #0
981 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
982 load_clear \i, x2, x9
983 .endr
984
985 bl idct16_half
986
987 // Do two 8x8 transposes. Originally, v16-v31 contain the
988 // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
989 // transposed 8x8 blocks.
990 transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
991 transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
992
993 // Store the transposed 8x8 blocks horizontally.
994 // The first 8x8 block is kept in registers for the second pass,
995 // store the rest in the temp buffer.
996 add x0, x0, #16
997 .irp i, 24, 25, 26, 27, 28, 29, 30, 31
998 store \i, x0, x9
999 .endr
1000 br x14
1001 endfunc
1002
1003 function idct16_1d_8x16_pass2_half_neon
1004 mov x14, x30
1005 cbz x3, 1f
1006 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1007 load \i, x2, x9
1008 .endr
1009 1:
1010
1011 add x3, x0, x1
1012 lsl x1, x1, #1
1013 bl idct16_half
1014
1015 load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
1016 load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
1017
1018 br x14
1019 endfunc
1020
1021 .macro idct16_partial size
1022 function idct16x16_\size\()_add_neon
1023 add x0, sp, #(0*32)
1024 add x2, x6, #(0*2)
1025 bl idct16_1d_8x16_pass1_\size\()_neon
1026 .irp i, 0, 8
1027 add x0, x4, #(\i)
1028 mov x1, x5
1029 add x2, sp, #(\i*2)
1030 mov x3, #\i
1031 bl idct16_1d_8x16_pass2_\size\()_neon
1032 .endr
1033
1034 add sp, sp, #512
1035 br x15
1036 endfunc
1037 .endm
1038
1039 idct16_partial quarter
1040 idct16_partial half
1041
1042 function idct32x32_dc_add_neon
1043 movrel x4, idct_coeffs
1044 ld1 {v0.4h}, [x4]
1045
1046 movi v1.4h, #0
1047
1048 ld1r {v2.4h}, [x2]
1049 smull v2.4s, v2.4h, v0.h[0]
1050 rshrn v2.4h, v2.4s, #14
1051 smull v2.4s, v2.4h, v0.h[0]
1052 rshrn v2.4h, v2.4s, #14
1053 dup v2.8h, v2.h[0]
1054 st1 {v1.h}[0], [x2]
1055
1056 srshr v0.8h, v2.8h, #6
1057
1058 mov x4, #32
1059 1:
1060 // Loop to add the constant v0 into all 32x32 outputs
1061 ld1 {v1.16b,v2.16b}, [x0]
1062 uaddw v3.8h, v0.8h, v1.8b
1063 uaddw2 v4.8h, v0.8h, v1.16b
1064 uaddw v5.8h, v0.8h, v2.8b
1065 uaddw2 v6.8h, v0.8h, v2.16b
1066 sqxtun v3.8b, v3.8h
1067 sqxtun2 v3.16b, v4.8h
1068 sqxtun v4.8b, v5.8h
1069 sqxtun2 v4.16b, v6.8h
1070 st1 {v3.16b,v4.16b}, [x0], x1
1071 subs x4, x4, #1
1072 b.ne 1b
1073
1074 ret
1075 endfunc
1076
1077 .macro idct32_end
1078 butterfly_8h v16, v5, v4, v5 // v16 = t16a, v5 = t19a
1079 butterfly_8h v17, v20, v23, v20 // v17 = t17, v20 = t18
1080 butterfly_8h v18, v6, v7, v6 // v18 = t23a, v6 = t20a
1081 butterfly_8h v19, v21, v22, v21 // v19 = t22, v21 = t21
1082 butterfly_8h v4, v28, v28, v30 // v4 = t24a, v28 = t27a
1083 butterfly_8h v23, v26, v25, v26 // v23 = t25, v26 = t26
1084 butterfly_8h v7, v3, v29, v31 // v7 = t31a, v3 = t28a
1085 butterfly_8h v22, v27, v24, v27 // v22 = t30, v27 = t29
1086
1087 dmbutterfly v27, v20, v0.h[1], v0.h[2], v24, v25, v30, v31 // v27 = t18a, v20 = t29a
1088 dmbutterfly v3, v5, v0.h[1], v0.h[2], v24, v25, v30, v31 // v3 = t19, v5 = t28
1089 dmbutterfly v28, v6, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20
1090 dmbutterfly v26, v21, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
1091
1092 butterfly_8h v31, v24, v7, v4 // v31 = t31, v24 = t24
1093 butterfly_8h v30, v25, v22, v23 // v30 = t30a, v25 = t25a
1094 butterfly_8h_r v23, v16, v16, v18 // v23 = t23, v16 = t16
1095 butterfly_8h_r v22, v17, v17, v19 // v22 = t22a, v17 = t17a
1096 butterfly_8h v18, v21, v27, v21 // v18 = t18, v21 = t21
1097 butterfly_8h_r v27, v28, v5, v28 // v27 = t27a, v28 = t28a
1098 butterfly_8h v29, v26, v20, v26 // v29 = t29, v26 = t26
1099 butterfly_8h v19, v20, v3, v6 // v19 = t19a, v20 = t20
1100
1101 dmbutterfly0 v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27, v20 = t20
1102 dmbutterfly0 v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
1103 dmbutterfly0 v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25, v22 = t22
1104 dmbutterfly0 v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
1105 ret
1106 .endm
1107
1108 function idct32_odd
1109 ld1 {v0.8h,v1.8h}, [x11]
1110
1111 dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1112 dmbutterfly v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1113 dmbutterfly v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1114 dmbutterfly v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1115 dmbutterfly v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1116 dmbutterfly v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1117 dmbutterfly v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1118 dmbutterfly v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
1119
1120 ld1 {v0.8h}, [x10]
1121
1122 butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17
1123 butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18
1124 butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21
1125 butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22
1126 butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25
1127 butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26
1128 butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30
1129 butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29
1130
1131 dmbutterfly v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
1132 dmbutterfly v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1133 dmbutterfly v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
1134 dmbutterfly v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
1135 idct32_end
1136 endfunc
1137
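// Same as idct32_odd above, but assuming that the odd inputs 17-31
// (v24-v31) are zero; the dmbutterfly*_h1/_h2 variants exploit this.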
1138 function idct32_odd_half
1139 ld1 {v0.8h,v1.8h}, [x11]
1140
1141 dmbutterfly_h1 v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1142 dmbutterfly_h2 v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1143 dmbutterfly_h1 v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1144 dmbutterfly_h2 v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1145 dmbutterfly_h1 v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1146 dmbutterfly_h2 v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1147 dmbutterfly_h1 v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1148 dmbutterfly_h2 v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
1149
1150 ld1 {v0.8h}, [x10]
1151
1152 butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17
1153 butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18
1154 butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21
1155 butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22
1156 butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25
1157 butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26
1158 butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30
1159 butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29
1160
1161 dmbutterfly v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
1162 dmbutterfly v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1163 dmbutterfly v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
1164 dmbutterfly v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
1165 idct32_end
1166 endfunc
1167
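// Same as idct32_odd above, but assuming that only the odd inputs 1, 3, 5, 7
// (v16-v19) are nonzero; the first stage reduces to plain multiplications.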
1168 function idct32_odd_quarter
1169 ld1 {v0.8h,v1.8h}, [x11]
1170
1171 dsmull_h v4, v5, v16, v0.h[0]
1172 dsmull_h v28, v29, v19, v0.h[7]
1173 dsmull_h v30, v31, v16, v0.h[1]
1174 dsmull_h v22, v23, v17, v1.h[6]
1175 dsmull_h v7, v6, v17, v1.h[7]
1176 dsmull_h v26, v27, v19, v0.h[6]
1177 dsmull_h v20, v21, v18, v1.h[0]
1178 dsmull_h v24, v25, v18, v1.h[1]
1179
1180 ld1 {v0.8h}, [x10]
1181
1182 neg v28.4s, v28.4s
1183 neg v29.4s, v29.4s
1184 neg v7.4s, v7.4s
1185 neg v6.4s, v6.4s
1186
1187 drshrn_h v4, v4, v5, #14
1188 drshrn_h v5, v28, v29, #14
1189 drshrn_h v29, v30, v31, #14
1190 drshrn_h v28, v22, v23, #14
1191 drshrn_h v7, v7, v6, #14
1192 drshrn_h v31, v26, v27, #14
1193 drshrn_h v6, v20, v21, #14
1194 drshrn_h v30, v24, v25, #14
1195
1196 dmbutterfly_l v16, v17, v18, v19, v29, v4, v0.h[3], v0.h[4]
1197 dmbutterfly_l v27, v26, v20, v21, v31, v5, v0.h[3], v0.h[4]
1198 drshrn_h v23, v16, v17, #14
1199 drshrn_h v24, v18, v19, #14
1200 neg v20.4s, v20.4s
1201 neg v21.4s, v21.4s
1202 drshrn_h v27, v27, v26, #14
1203 drshrn_h v20, v20, v21, #14
1204 dmbutterfly_l v16, v17, v18, v19, v30, v6, v0.h[5], v0.h[6]
1205 drshrn_h v21, v16, v17, #14
1206 drshrn_h v26, v18, v19, #14
1207 dmbutterfly_l v16, v17, v18, v19, v28, v7, v0.h[5], v0.h[6]
1208 drshrn_h v25, v16, v17, #14
1209 neg v18.4s, v18.4s
1210 neg v19.4s, v19.4s
1211 drshrn_h v22, v18, v19, #14
1212
1213 idct32_end
1214 endfunc
1215
1216 .macro idct32_funcs suffix
1217 // Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix.
1218 // The 32-point IDCT can be decomposed into two 16-point IDCTs;
1219 // a normal IDCT16 with every other input component (the even ones, with
1220 // each output written twice), followed by a separate 16-point IDCT
1221 // of the odd inputs, added/subtracted onto the outputs of the first idct16.
1222 // x0 = dst (temp buffer)
1223 // x1 = unused
1224 // x2 = src
1225 // x9 = double input stride
1226 // x10 = idct_coeffs
1227 // x11 = idct_coeffs + 32
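// As a rough C-style illustration of the decomposition described above
// (sketch only; idct16_even()/idct16_odd() are hypothetical stand-ins for the
// even and odd halves):
//   idct16_even(in_even, even);   // in_even = in[0], in[2], ..., in[30]
//   idct16_odd(in_odd, odd);      // in_odd  = in[1], in[3], ..., in[31]
//   for (i = 0; i < 16; i++) {
//       out[i]      = even[i] + odd[15 - i];
//       out[31 - i] = even[i] - odd[15 - i];
//   }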
1228 function idct32_1d_8x32_pass1\suffix\()_neon
1229 mov x14, x30
1230 ld1 {v0.8h,v1.8h}, [x10]
1231
1232 movi v2.8h, #0
1233
1234 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1235 .ifb \suffix
1236 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1237 load_clear \i, x2, x9
1238 .endr
1239 .endif
1240 .ifc \suffix,_quarter
1241 .irp i, 16, 17, 18, 19
1242 load_clear \i, x2, x9
1243 .endr
1244 .endif
1245 .ifc \suffix,_half
1246 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1247 load_clear \i, x2, x9
1248 .endr
1249 .endif
1250
1251 bl idct16\suffix
1252
1253 // Do two 8x8 transposes. Originally, v16-v31 contain the
1254 // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
1255 // two transposed 8x8 blocks.
1256 transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
1257 transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
1258
1259 // Store the registers a, b horizontally, followed by the
1260 // same registers b, a mirrored.
1261 .macro store_rev a, b
1262 // There's no rev128 instruction, but we reverse each 64-bit
1263 // half, and then flip them using an ext with an 8-byte offset.
1264 rev64 v1.8h, \b
1265 st1 {\a}, [x0], #16
1266 rev64 v0.8h, \a
1267 ext v1.16b, v1.16b, v1.16b, #8
1268 st1 {\b}, [x0], #16
1269 ext v0.16b, v0.16b, v0.16b, #8
1270 st1 {v1.8h}, [x0], #16
1271 st1 {v0.8h}, [x0], #16
1272 .endm
1273 store_rev v16.8h, v24.8h
1274 store_rev v17.8h, v25.8h
1275 store_rev v18.8h, v26.8h
1276 store_rev v19.8h, v27.8h
1277 store_rev v20.8h, v28.8h
1278 store_rev v21.8h, v29.8h
1279 store_rev v22.8h, v30.8h
1280 store_rev v23.8h, v31.8h
1281 sub x0, x0, #512
1282 .purgem store_rev
1283
1284 // Move x2 back to the start of the input, and move
1285 // to the first odd row
1286 .ifb \suffix
1287 sub x2, x2, x9, lsl #4
1288 .endif
1289 .ifc \suffix,_quarter
1290 sub x2, x2, x9, lsl #2
1291 .endif
1292 .ifc \suffix,_half
1293 sub x2, x2, x9, lsl #3
1294 .endif
1295 add x2, x2, #64
1296
1297 movi v2.8h, #0
1298 // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1299 .ifb \suffix
1300 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1301 load_clear \i, x2, x9
1302 .endr
1303 .endif
1304 .ifc \suffix,_quarter
1305 .irp i, 16, 17, 18, 19
1306 load_clear \i, x2, x9
1307 .endr
1308 .endif
1309 .ifc \suffix,_half
1310 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1311 load_clear \i, x2, x9
1312 .endr
1313 .endif
1314
1315 bl idct32_odd\suffix
1316
1317 transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
1318 transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
1319
1320 // Store the registers a, b horizontally,
1321 // adding into the output first, and then the mirrored values,
1322 // subtracted from the output.
1323 .macro store_rev a, b
1324 ld1 {v4.8h}, [x0]
1325 rev64 v1.8h, \b
1326 add v4.8h, v4.8h, \a
1327 rev64 v0.8h, \a
1328 st1 {v4.8h}, [x0], #16
1329 ext v1.16b, v1.16b, v1.16b, #8
1330 ld1 {v5.8h}, [x0]
1331 ext v0.16b, v0.16b, v0.16b, #8
1332 add v5.8h, v5.8h, \b
1333 st1 {v5.8h}, [x0], #16
1334 ld1 {v6.8h}, [x0]
1335 sub v6.8h, v6.8h, v1.8h
1336 st1 {v6.8h}, [x0], #16
1337 ld1 {v7.8h}, [x0]
1338 sub v7.8h, v7.8h, v0.8h
1339 st1 {v7.8h}, [x0], #16
1340 .endm
1341
1342 store_rev v31.8h, v23.8h
1343 store_rev v30.8h, v22.8h
1344 store_rev v29.8h, v21.8h
1345 store_rev v28.8h, v20.8h
1346 store_rev v27.8h, v19.8h
1347 store_rev v26.8h, v18.8h
1348 store_rev v25.8h, v17.8h
1349 store_rev v24.8h, v16.8h
1350 .purgem store_rev
1351 br x14
1352 endfunc
1353
1354 // This is mostly the same as 8x32_pass1, but without the transpose;
1355 // it uses the source as a temp buffer between the two idct passes, and
1356 // adds into the destination.
1357 // x0 = dst
1358 // x1 = dst stride
1359 // x2 = src (temp buffer)
1360 // x7 = negative double temp buffer stride
1361 // x9 = double temp buffer stride
1362 // x10 = idct_coeffs
1363 // x11 = idct_coeffs + 32
1364 function idct32_1d_8x32_pass2\suffix\()_neon
1365 mov x14, x30
1366 ld1 {v0.8h,v1.8h}, [x10]
1367
1368 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1369 .ifb \suffix
1370 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1371 load \i, x2, x9
1372 .endr
1373 sub x2, x2, x9, lsl #4
1374 .endif
1375 .ifc \suffix,_quarter
1376 .irp i, 16, 17, 18, 19
1377 load \i, x2, x9
1378 .endr
1379 sub x2, x2, x9, lsl #2
1380 .endif
1381 .ifc \suffix,_half
1382 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1383 load \i, x2, x9
1384 .endr
1385 sub x2, x2, x9, lsl #3
1386 .endif
1387
1388 bl idct16\suffix
1389
1390 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1391 store \i, x2, x9
1392 .endr
1393
1394 sub x2, x2, x9, lsl #4
1395 add x2, x2, #64
1396
1397 // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1398 .ifb \suffix
1399 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1400 load \i, x2, x9
1401 .endr
1402 sub x2, x2, x9, lsl #4
1403 .endif
1404 .ifc \suffix,_quarter
1405 .irp i, 16, 17, 18, 19
1406 load \i, x2, x9
1407 .endr
1408 sub x2, x2, x9, lsl #2
1409 .endif
1410 .ifc \suffix,_half
1411 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1412 load \i, x2, x9
1413 .endr
1414 sub x2, x2, x9, lsl #3
1415 .endif
1416 sub x2, x2, #64
1417
1418 bl idct32_odd\suffix
1419
1420 .macro load_acc_store a, b, c, d, neg=0
1421 .if \neg == 0
1422 ld1 {v4.8h}, [x2], x9
1423 ld1 {v5.8h}, [x2], x9
1424 add v4.8h, v4.8h, \a
1425 ld1 {v6.8h}, [x2], x9
1426 add v5.8h, v5.8h, \b
1427 ld1 {v7.8h}, [x2], x9
1428 add v6.8h, v6.8h, \c
1429 add v7.8h, v7.8h, \d
1430 .else
1431 ld1 {v4.8h}, [x2], x7
1432 ld1 {v5.8h}, [x2], x7
1433 sub v4.8h, v4.8h, \a
1434 ld1 {v6.8h}, [x2], x7
1435 sub v5.8h, v5.8h, \b
1436 ld1 {v7.8h}, [x2], x7
1437 sub v6.8h, v6.8h, \c
1438 sub v7.8h, v7.8h, \d
1439 .endif
1440 ld1 {v0.8b}, [x0], x1
1441 ld1 {v1.8b}, [x0], x1
1442 srshr v4.8h, v4.8h, #6
1443 ld1 {v2.8b}, [x0], x1
1444 srshr v5.8h, v5.8h, #6
1445 uaddw v4.8h, v4.8h, v0.8b
1446 ld1 {v3.8b}, [x0], x1
1447 srshr v6.8h, v6.8h, #6
1448 uaddw v5.8h, v5.8h, v1.8b
1449 srshr v7.8h, v7.8h, #6
1450 sub x0, x0, x1, lsl #2
1451 uaddw v6.8h, v6.8h, v2.8b
1452 sqxtun v4.8b, v4.8h
1453 uaddw v7.8h, v7.8h, v3.8b
1454 sqxtun v5.8b, v5.8h
1455 st1 {v4.8b}, [x0], x1
1456 sqxtun v6.8b, v6.8h
1457 st1 {v5.8b}, [x0], x1
1458 sqxtun v7.8b, v7.8h
1459 st1 {v6.8b}, [x0], x1
1460 st1 {v7.8b}, [x0], x1
1461 .endm
1462 load_acc_store v31.8h, v30.8h, v29.8h, v28.8h
1463 load_acc_store v27.8h, v26.8h, v25.8h, v24.8h
1464 load_acc_store v23.8h, v22.8h, v21.8h, v20.8h
1465 load_acc_store v19.8h, v18.8h, v17.8h, v16.8h
1466 sub x2, x2, x9
1467 load_acc_store v16.8h, v17.8h, v18.8h, v19.8h, 1
1468 load_acc_store v20.8h, v21.8h, v22.8h, v23.8h, 1
1469 load_acc_store v24.8h, v25.8h, v26.8h, v27.8h, 1
1470 load_acc_store v28.8h, v29.8h, v30.8h, v31.8h, 1
1471 .purgem load_acc_store
1472 br x14
1473 endfunc
1474 .endm
1475
1476 idct32_funcs
1477 idct32_funcs _quarter
1478 idct32_funcs _half
1479
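// Minimum eob thresholds for the four 8-column slices of the 32x32 input:
// if eob is at most the value for a slice, that slice (and the following
// ones) contains no nonzero coefficients, so pass 1 is skipped for it and
// the temp buffer is zero-filled instead.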
1480 const min_eob_idct_idct_32, align=4
1481 .short 0, 34, 135, 336
1482 endconst
1483
1484 function ff_vp9_idct_idct_32x32_add_neon, export=1
1485 cmp w3, #1
1486 b.eq idct32x32_dc_add_neon
1487
1488 movrel x10, idct_coeffs
1489 add x11, x10, #32
1490 movrel x12, min_eob_idct_idct_32, 2
1491
1492 mov x15, x30
1493
1494 stp d14, d15, [sp, #-0x10]!
1495 stp d12, d13, [sp, #-0x10]!
1496 stp d10, d11, [sp, #-0x10]!
1497 stp d8, d9, [sp, #-0x10]!
1498
1499 sub sp, sp, #2048
1500
1501 mov x4, x0
1502 mov x5, x1
1503 mov x6, x2
1504
1505 // Double stride of the input, since we only read every other line
1506 mov x9, #128
1507 neg x7, x9
1508
1509 cmp w3, #34
1510 b.le idct32x32_quarter_add_neon
1511 cmp w3, #135
1512 b.le idct32x32_half_add_neon
1513
1514 .irp i, 0, 8, 16, 24
1515 add x0, sp, #(\i*64)
1516 .if \i > 0
1517 ldrh w1, [x12], #2
1518 cmp w3, w1
1519 mov x1, #(32 - \i)/4
1520 b.le 1f
1521 .endif
1522 add x2, x6, #(\i*2)
1523 bl idct32_1d_8x32_pass1_neon
1524 .endr
1525 b 3f
1526
1527 1:
1528 // Write zeros to the temp buffer for pass 2
1529 movi v16.8h, #0
1530 movi v17.8h, #0
1531 movi v18.8h, #0
1532 movi v19.8h, #0
1533 2:
1534 subs x1, x1, #1
1535 .rept 4
1536 st1 {v16.8h-v19.8h}, [x0], #64
1537 .endr
1538 b.ne 2b
1539 3:
1540 .irp i, 0, 8, 16, 24
1541 add x0, x4, #(\i)
1542 mov x1, x5
1543 add x2, sp, #(\i*2)
1544 bl idct32_1d_8x32_pass2_neon
1545 .endr
1546
1547 add sp, sp, #2048
1548
1549 ldp d8, d9, [sp], 0x10
1550 ldp d10, d11, [sp], 0x10
1551 ldp d12, d13, [sp], 0x10
1552 ldp d14, d15, [sp], 0x10
1553
1554 br x15
1555 endfunc
1556
1557 .macro idct32_partial size
1558 function idct32x32_\size\()_add_neon
1559 add x0, sp, #(0*64)
1560 add x2, x6, #(0*2)
1561 bl idct32_1d_8x32_pass1_\size\()_neon
1562 .ifc \size,half
1563 add x0, sp, #(8*64)
1564 add x2, x6, #(8*2)
1565 bl idct32_1d_8x32_pass1_\size\()_neon
1566 .endif
1567 .irp i, 0, 8, 16, 24
1568 add x0, x4, #(\i)
1569 mov x1, x5
1570 add x2, sp, #(\i*2)
1571 bl idct32_1d_8x32_pass2_\size\()_neon
1572 .endr
1573
1574 add sp, sp, #2048
1575
1576 ldp d8, d9, [sp], 0x10
1577 ldp d10, d11, [sp], 0x10
1578 ldp d12, d13, [sp], 0x10
1579 ldp d14, d15, [sp], 0x10
1580
1581 br x15
1582 endfunc
1583 .endm
1584
1585 idct32_partial quarter
1586 idct32_partial half