aarch64: vp9itxfm: Optimize 16x16 and 32x32 idct dc by unrolling
[libav.git] / libavcodec / aarch64 / vp9itxfm_neon.S
1/*
2 * Copyright (c) 2016 Google Inc.
3 *
4 * This file is part of Libav.
5 *
6 * Libav is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * Libav is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with Libav; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/aarch64/asm.S"
22#include "neon.S"
23
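// Note: the tables below hold the transform constants in Q14 fixed point,
// e.g. 11585 ~ 16384*cos(pi/4), 15137 ~ 16384*cos(pi/8), 6270 ~ 16384*sin(pi/8).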
24const itxfm4_coeffs, align=4
25 .short 11585, 6270, 15137, 0
26iadst4_coeffs:
27 .short 5283, 15212, 9929, 13377
28endconst
29
30const iadst8_coeffs, align=4
31 .short 16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
32idct_coeffs:
33 .short 11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606
34 .short 16305, 12665, 10394, 7723, 14449, 15679, 4756, 0
35 .short 804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
36 .short 3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
37endconst
38
39const iadst16_coeffs, align=4
40 .short 16364, 804, 15893, 3981, 14811, 7005, 13160, 9760
41 .short 11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207
42endconst
43
44// out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14
45// out2 = ((in1 - in2) * v0[0] + (1 << 13)) >> 14
 46// in/out are .8h registers; this can be done with 4 temp registers, but is
 47// more efficient if 6 temp registers are available.
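// With neg=1, out1 is computed with the negated coefficient (-v0[0]);
// out2 still uses v0[0].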
48.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
49.if \neg > 0
50 neg \tmp4\().4h, v0.4h
51.endif
52 add \tmp1\().8h, \in1\().8h, \in2\().8h
53 sub \tmp2\().8h, \in1\().8h, \in2\().8h
54.if \neg > 0
55 smull \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
56 smull2 \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0]
57.else
58 smull \tmp3\().4s, \tmp1\().4h, v0.h[0]
59 smull2 \tmp4\().4s, \tmp1\().8h, v0.h[0]
60.endif
61.ifb \tmp5
62 rshrn \out1\().4h, \tmp3\().4s, #14
63 rshrn2 \out1\().8h, \tmp4\().4s, #14
64 smull \tmp3\().4s, \tmp2\().4h, v0.h[0]
65 smull2 \tmp4\().4s, \tmp2\().8h, v0.h[0]
66 rshrn \out2\().4h, \tmp3\().4s, #14
67 rshrn2 \out2\().8h, \tmp4\().4s, #14
68.else
69 smull \tmp5\().4s, \tmp2\().4h, v0.h[0]
70 smull2 \tmp6\().4s, \tmp2\().8h, v0.h[0]
71 rshrn \out1\().4h, \tmp3\().4s, #14
72 rshrn2 \out1\().8h, \tmp4\().4s, #14
73 rshrn \out2\().4h, \tmp5\().4s, #14
74 rshrn2 \out2\().8h, \tmp6\().4s, #14
75.endif
76.endm
77
78// Same as dmbutterfly0 above, but treating the input in in2 as zero,
79// writing the same output into both out1 and out2.
80.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
81 smull \tmp1\().4s, \in1\().4h, v0.h[0]
82 smull2 \tmp2\().4s, \in1\().8h, v0.h[0]
83 rshrn \out1\().4h, \tmp1\().4s, #14
84 rshrn2 \out1\().8h, \tmp2\().4s, #14
85 rshrn \out2\().4h, \tmp1\().4s, #14
86 rshrn2 \out2\().8h, \tmp2\().4s, #14
87.endm
88
89// out1,out2 = in1 * coef1 - in2 * coef2
90// out3,out4 = in1 * coef2 + in2 * coef1
91// out are 4 x .4s registers, in are 2 x .8h registers
92.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
93 smull \out1\().4s, \in1\().4h, \coef1
94 smull2 \out2\().4s, \in1\().8h, \coef1
95 smull \out3\().4s, \in1\().4h, \coef2
96 smull2 \out4\().4s, \in1\().8h, \coef2
97 smlsl \out1\().4s, \in2\().4h, \coef2
98 smlsl2 \out2\().4s, \in2\().8h, \coef2
99 smlal \out3\().4s, \in2\().4h, \coef1
100 smlal2 \out4\().4s, \in2\().8h, \coef1
101.endm
102
103// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
104// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
105// inout are 2 x .8h registers
106.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
107 dmbutterfly_l \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
108.if \neg > 0
109 neg \tmp3\().4s, \tmp3\().4s
110 neg \tmp4\().4s, \tmp4\().4s
111.endif
112 rshrn \inout1\().4h, \tmp1\().4s, #14
113 rshrn2 \inout1\().8h, \tmp2\().4s, #14
114 rshrn \inout2\().4h, \tmp3\().4s, #14
115 rshrn2 \inout2\().8h, \tmp4\().4s, #14
116.endm
117
118// Same as dmbutterfly above, but treating the input in inout2 as zero
119.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
120 smull \tmp1\().4s, \inout1\().4h, \coef1
121 smull2 \tmp2\().4s, \inout1\().8h, \coef1
122 smull \tmp3\().4s, \inout1\().4h, \coef2
123 smull2 \tmp4\().4s, \inout1\().8h, \coef2
124 rshrn \inout1\().4h, \tmp1\().4s, #14
125 rshrn2 \inout1\().8h, \tmp2\().4s, #14
126 rshrn \inout2\().4h, \tmp3\().4s, #14
127 rshrn2 \inout2\().8h, \tmp4\().4s, #14
128.endm
129
130// Same as dmbutterfly above, but treating the input in inout1 as zero
131.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
132 smull \tmp1\().4s, \inout2\().4h, \coef2
133 smull2 \tmp2\().4s, \inout2\().8h, \coef2
134 smull \tmp3\().4s, \inout2\().4h, \coef1
135 smull2 \tmp4\().4s, \inout2\().8h, \coef1
136 neg \tmp1\().4s, \tmp1\().4s
137 neg \tmp2\().4s, \tmp2\().4s
138 rshrn \inout2\().4h, \tmp3\().4s, #14
139 rshrn2 \inout2\().8h, \tmp4\().4s, #14
140 rshrn \inout1\().4h, \tmp1\().4s, #14
141 rshrn2 \inout1\().8h, \tmp2\().4s, #14
142.endm
143
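// Widening multiply of one .8h register by a scalar coefficient, producing
// the result as two .4s registers (low half in out1, high half in out2).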
144.macro dsmull_h out1, out2, in, coef
145 smull \out1\().4s, \in\().4h, \coef
146 smull2 \out2\().4s, \in\().8h, \coef
147.endm
148
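// Narrow two .4s registers back into one .8h register with a rounding right shift.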
149.macro drshrn_h out, in1, in2, shift
150 rshrn \out\().4h, \in1\().4s, \shift
151 rshrn2 \out\().8h, \in2\().4s, \shift
152.endm
153
154
155// out1 = in1 + in2
156// out2 = in1 - in2
157.macro butterfly_8h out1, out2, in1, in2
158 add \out1\().8h, \in1\().8h, \in2\().8h
159 sub \out2\().8h, \in1\().8h, \in2\().8h
160.endm
161
162// out1 = in1 - in2
163// out2 = in1 + in2
164.macro butterfly_8h_r out1, out2, in1, in2
165 sub \out1\().8h, \in1\().8h, \in2\().8h
166 add \out2\().8h, \in1\().8h, \in2\().8h
167.endm
168
169// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
170// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
171// out are 2 x .8h registers, in are 4 x .4s registers
172.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
173 add \tmp1\().4s, \in1\().4s, \in3\().4s
174 add \tmp2\().4s, \in2\().4s, \in4\().4s
175 sub \tmp3\().4s, \in1\().4s, \in3\().4s
176 sub \tmp4\().4s, \in2\().4s, \in4\().4s
177 rshrn \out1\().4h, \tmp1\().4s, #14
178 rshrn2 \out1\().8h, \tmp2\().4s, #14
179 rshrn \out2\().4h, \tmp3\().4s, #14
180 rshrn2 \out2\().8h, \tmp4\().4s, #14
181.endm
182
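// In-place 4x4 inverse Walsh-Hadamard transform on c0-c3 (.4h), as used for
// lossless blocks; clobbers v16-v17.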
183.macro iwht4 c0, c1, c2, c3
184 add \c0\().4h, \c0\().4h, \c1\().4h
185 sub v17.4h, \c2\().4h, \c3\().4h
186 sub v16.4h, \c0\().4h, v17.4h
187 sshr v16.4h, v16.4h, #1
188 sub \c2\().4h, v16.4h, \c1\().4h
189 sub \c1\().4h, v16.4h, \c3\().4h
190 add \c3\().4h, v17.4h, \c2\().4h
191 sub \c0\().4h, \c0\().4h, \c1\().4h
192.endm
193
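// In-place 4-point IDCT on c0-c3 (.4h); clobbers v16-v20 and v22.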
194.macro idct4 c0, c1, c2, c3
195 smull v22.4s, \c1\().4h, v0.h[2]
196 smull v20.4s, \c1\().4h, v0.h[1]
197 add v16.4h, \c0\().4h, \c2\().4h
198 sub v17.4h, \c0\().4h, \c2\().4h
199 smlal v22.4s, \c3\().4h, v0.h[1]
200 smull v18.4s, v16.4h, v0.h[0]
201 smull v19.4s, v17.4h, v0.h[0]
202 smlsl v20.4s, \c3\().4h, v0.h[2]
203 rshrn v22.4h, v22.4s, #14
204 rshrn v18.4h, v18.4s, #14
205 rshrn v19.4h, v19.4s, #14
206 rshrn v20.4h, v20.4s, #14
207 add \c0\().4h, v18.4h, v22.4h
208 sub \c3\().4h, v18.4h, v22.4h
209 add \c1\().4h, v19.4h, v20.4h
210 sub \c2\().4h, v19.4h, v20.4h
211.endm
212
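// In-place 4-point IADST on c0-c3 (.4h); clobbers v16-v21.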
213.macro iadst4 c0, c1, c2, c3
214 smull v16.4s, \c0\().4h, v0.h[4]
215 smlal v16.4s, \c2\().4h, v0.h[5]
216 smlal v16.4s, \c3\().4h, v0.h[6]
217 smull v17.4s, \c0\().4h, v0.h[6]
218 smlsl v17.4s, \c2\().4h, v0.h[4]
219 sub \c0\().4h, \c0\().4h, \c2\().4h
220 smlsl v17.4s, \c3\().4h, v0.h[5]
221 add \c0\().4h, \c0\().4h, \c3\().4h
222 smull v19.4s, \c1\().4h, v0.h[7]
223 smull v18.4s, \c0\().4h, v0.h[7]
224 add v20.4s, v16.4s, v19.4s
225 add v21.4s, v17.4s, v19.4s
226 rshrn \c0\().4h, v20.4s, #14
227 add v16.4s, v16.4s, v17.4s
 228 rshrn \c1\().4h, v21.4s, #14
229 sub v16.4s, v16.4s, v19.4s
230 rshrn \c2\().4h, v18.4s, #14
231 rshrn \c3\().4h, v16.4s, #14
232.endm
233
 234// The public functions in this file have the following signature:
235// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
236
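// x0 = dst, x1 = stride, x2 = block (coefficients), w3 = eob; the coefficients
// that were read are cleared in the block buffer before returning.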
237.macro itxfm_func4x4 txfm1, txfm2
238function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
239.ifc \txfm1,\txfm2
240.ifc \txfm1,idct
241 movrel x4, itxfm4_coeffs
242 ld1 {v0.4h}, [x4]
243.endif
244.ifc \txfm1,iadst
245 movrel x4, iadst4_coeffs
246 ld1 {v0.d}[1], [x4]
247.endif
248.else
249 movrel x4, itxfm4_coeffs
250 ld1 {v0.8h}, [x4]
251.endif
252
253 movi v31.8h, #0
254.ifc \txfm1\()_\txfm2,idct_idct
 255 cmp w3, #1
256 b.ne 1f
257 // DC-only for idct/idct
 258 ld1 {v2.h}[0], [x2]
259 smull v2.4s, v2.4h, v0.h[0]
260 rshrn v2.4h, v2.4s, #14
261 smull v2.4s, v2.4h, v0.h[0]
262 rshrn v2.4h, v2.4s, #14
263 st1 {v31.h}[0], [x2]
264 dup v4.4h, v2.h[0]
265 mov v5.16b, v4.16b
266 mov v6.16b, v4.16b
267 mov v7.16b, v4.16b
268 b 2f
269.endif
270
2711:
272 ld1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x2]
273 st1 {v31.8h}, [x2], #16
274
275.ifc \txfm1,iwht
276 sshr v4.4h, v4.4h, #2
277 sshr v5.4h, v5.4h, #2
278 sshr v6.4h, v6.4h, #2
279 sshr v7.4h, v7.4h, #2
280.endif
281
282 \txfm1\()4 v4, v5, v6, v7
283
284 st1 {v31.8h}, [x2], #16
285 // Transpose 4x4 with 16 bit elements
286 transpose_4x4H v4, v5, v6, v7, v16, v17, v18, v19
287
288 \txfm2\()4 v4, v5, v6, v7
2892:
290 ld1 {v0.s}[0], [x0], x1
291 ld1 {v1.s}[0], [x0], x1
292.ifnc \txfm1,iwht
293 srshr v4.4h, v4.4h, #4
294 srshr v5.4h, v5.4h, #4
295 srshr v6.4h, v6.4h, #4
296 srshr v7.4h, v7.4h, #4
297.endif
298 uaddw v4.8h, v4.8h, v0.8b
299 uaddw v5.8h, v5.8h, v1.8b
300 ld1 {v2.s}[0], [x0], x1
301 ld1 {v3.s}[0], [x0], x1
302 sqxtun v0.8b, v4.8h
303 sqxtun v1.8b, v5.8h
304 sub x0, x0, x1, lsl #2
305
306 uaddw v6.8h, v6.8h, v2.8b
307 uaddw v7.8h, v7.8h, v3.8b
308 st1 {v0.s}[0], [x0], x1
309 sqxtun v2.8b, v6.8h
310 sqxtun v3.8b, v7.8h
311
312 st1 {v1.s}[0], [x0], x1
313 st1 {v2.s}[0], [x0], x1
314 st1 {v3.s}[0], [x0], x1
315
316 ret
317endfunc
318.endm
319
320itxfm_func4x4 idct, idct
321itxfm_func4x4 iadst, idct
322itxfm_func4x4 idct, iadst
323itxfm_func4x4 iadst, iadst
324itxfm_func4x4 iwht, iwht
325
326
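// In-place 8-point IDCT on v16-v23 (.8h), with the idct coefficients in v0;
// clobbers v2-v7 and v24-v31.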
327.macro idct8
328 dmbutterfly0 v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
329 dmbutterfly v18, v22, v0.h[1], v0.h[2], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
330 dmbutterfly v17, v23, v0.h[3], v0.h[4], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
331 dmbutterfly v21, v19, v0.h[5], v0.h[6], v2, v3, v4, v5 // v21 = t5a, v19 = t6a
332
333 butterfly_8h v24, v25, v16, v22 // v24 = t0, v25 = t3
334 butterfly_8h v28, v29, v17, v21 // v28 = t4, v29 = t5a
335 butterfly_8h v30, v31, v23, v19 // v30 = t7, v31 = t6a
336 butterfly_8h v26, v27, v20, v18 // v26 = t1, v27 = t2
337
338 dmbutterfly0 v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5
339
340 butterfly_8h v16, v23, v24, v30 // v16 = out[0], v23 = out[7]
341 butterfly_8h v17, v22, v26, v31 // v17 = out[1], v22 = out[6]
 342 butterfly_8h v18, v21, v27, v29 // v18 = out[2], v21 = out[5]
 343 butterfly_8h v19, v20, v25, v28 // v19 = out[3], v20 = out[4]
344.endm
345
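// In-place 8-point IADST on v16-v23 (.8h), with the iadst8 coefficients in v1
// and the idct coefficients in v0; clobbers v2-v7 and v24-v31.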
346.macro iadst8
347 dmbutterfly_l v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0] // v24,v25 = t1a, v26,v27 = t0a
348 dmbutterfly_l v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2] // v28,v29 = t3a, v30,v31 = t2a
349 dmbutterfly_l v2, v3, v4, v5, v19, v20, v1.h[5], v1.h[4] // v2,v3 = t5a, v4,v5 = t4a
350 dmbutterfly_l v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6] // v16,v18 = t7a, v21,v23 = t6a
351
352 dbutterfly_n v4, v5, v26, v27, v4, v5, v6, v7, v26, v27 // v4 = t0, v5 = t4
353 dbutterfly_n v2, v3, v24, v25, v2, v3, v6, v7, v26, v27 // v2 = t1, v3 = t5
354 dbutterfly_n v24, v25, v30, v31, v21, v23, v6, v7, v26, v27 // v24 = t2, v25 = t6
355 dbutterfly_n v30, v31, v28, v29, v16, v18, v6, v7, v26, v27 // v30 = t3, v31 = t7
356
357 butterfly_8h v16, v6, v4, v24 // v16 = out[0], v6 = t2
358 butterfly_8h v23, v7, v2, v30 // v23 = -out[7], v7 = t3
359 neg v23.8h, v23.8h // v23 = out[7]
360
361 dmbutterfly0 v19, v20, v6, v7, v24, v26, v27, v28, v29, v30 // v19 = -out[3], v20 = out[4]
362 neg v19.8h, v19.8h // v19 = out[3]
363
364 dmbutterfly_l v26, v27, v28, v29, v5, v3, v0.h[1], v0.h[2] // v26,v27 = t5a, v28,v29 = t4a
365 dmbutterfly_l v2, v3, v4, v5, v31, v25, v0.h[2], v0.h[1] // v2,v3 = t6a, v4,v5 = t7a
366
367 dbutterfly_n v17, v30, v28, v29, v2, v3, v6, v7, v24, v25 // v17 = -out[1], v30 = t6
368 dbutterfly_n v22, v31, v26, v27, v4, v5, v6, v7, v24, v25 // v22 = out[6], v31 = t7
369 neg v17.8h, v17.8h // v17 = out[1]
370
371 dmbutterfly0 v18, v21, v30, v31, v2, v3, v4, v5, v6, v7 // v18 = out[2], v21 = -out[5]
372 neg v21.8h, v21.8h // v21 = out[5]
373.endm
374
375
376.macro itxfm_func8x8 txfm1, txfm2
377function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
378 // The iadst also uses a few coefficients from
379 // idct, so those always need to be loaded.
380.ifc \txfm1\()_\txfm2,idct_idct
381 movrel x4, idct_coeffs
382.else
383 movrel x4, iadst8_coeffs
384 ld1 {v1.8h}, [x4], #16
 385.endif
 386 ld1 {v0.8h}, [x4]
 387
388 movi v2.8h, #0
389 movi v3.8h, #0
390 movi v4.8h, #0
391 movi v5.8h, #0
392
393.ifc \txfm1\()_\txfm2,idct_idct
 394 cmp w3, #1
395 b.ne 1f
396 // DC-only for idct/idct
 397 ld1 {v2.h}[0], [x2]
398 smull v2.4s, v2.4h, v0.h[0]
399 rshrn v2.4h, v2.4s, #14
400 smull v2.4s, v2.4h, v0.h[0]
401 rshrn v2.4h, v2.4s, #14
402 st1 {v3.h}[0], [x2]
403 dup v16.8h, v2.h[0]
404 mov v17.16b, v16.16b
405 mov v18.16b, v16.16b
406 mov v19.16b, v16.16b
407 mov v20.16b, v16.16b
408 mov v21.16b, v16.16b
409 mov v22.16b, v16.16b
410 mov v23.16b, v16.16b
411 b 2f
412.endif
4131:
414 ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2], #64
415 ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2], #64
 416 sub x2, x2, #128
417 st1 {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64
418 st1 {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64
419
420 \txfm1\()8
421
422 // Transpose 8x8 with 16 bit elements
423 transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
424
425 \txfm2\()8
4262:
427 mov x3, x0
428 // Add into the destination
429 ld1 {v0.8b}, [x0], x1
430 srshr v16.8h, v16.8h, #5
431 ld1 {v1.8b}, [x0], x1
432 srshr v17.8h, v17.8h, #5
433 ld1 {v2.8b}, [x0], x1
434 srshr v18.8h, v18.8h, #5
435 uaddw v16.8h, v16.8h, v0.8b
436 ld1 {v3.8b}, [x0], x1
437 srshr v19.8h, v19.8h, #5
438 uaddw v17.8h, v17.8h, v1.8b
439 ld1 {v4.8b}, [x0], x1
440 srshr v20.8h, v20.8h, #5
441 uaddw v18.8h, v18.8h, v2.8b
442 sqxtun v0.8b, v16.8h
443 ld1 {v5.8b}, [x0], x1
444 srshr v21.8h, v21.8h, #5
445 uaddw v19.8h, v19.8h, v3.8b
446 sqxtun v1.8b, v17.8h
447 ld1 {v6.8b}, [x0], x1
448 srshr v22.8h, v22.8h, #5
449 uaddw v20.8h, v20.8h, v4.8b
450 sqxtun v2.8b, v18.8h
451 ld1 {v7.8b}, [x0], x1
452 srshr v23.8h, v23.8h, #5
453 uaddw v21.8h, v21.8h, v5.8b
454 sqxtun v3.8b, v19.8h
455
456 st1 {v0.8b}, [x3], x1
457 uaddw v22.8h, v22.8h, v6.8b
458 st1 {v1.8b}, [x3], x1
459 sqxtun v4.8b, v20.8h
460 st1 {v2.8b}, [x3], x1
461 uaddw v23.8h, v23.8h, v7.8b
462 st1 {v3.8b}, [x3], x1
463 sqxtun v5.8b, v21.8h
464 st1 {v4.8b}, [x3], x1
465 sqxtun v6.8b, v22.8h
466 st1 {v5.8b}, [x3], x1
467 sqxtun v7.8b, v23.8h
468
469 st1 {v6.8b}, [x3], x1
470 st1 {v7.8b}, [x3], x1
471
472 ret
473endfunc
474.endm
475
476itxfm_func8x8 idct, idct
477itxfm_func8x8 iadst, idct
478itxfm_func8x8 idct, iadst
479itxfm_func8x8 iadst, iadst
480
481
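// DC-only 16x16 idct: the single coefficient is scaled twice by
// (x * 11585 + 8192) >> 14 (once per pass), then (dc + 32) >> 6 is added to
// every pixel of the 16x16 block.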
482function idct16x16_dc_add_neon
483 movrel x4, idct_coeffs
484 ld1 {v0.4h}, [x4]
485
486 movi v1.4h, #0
487
 488 ld1 {v2.h}[0], [x2]
489 smull v2.4s, v2.4h, v0.h[0]
490 rshrn v2.4h, v2.4s, #14
491 smull v2.4s, v2.4h, v0.h[0]
492 rshrn v2.4h, v2.4s, #14
493 dup v2.8h, v2.h[0]
494 st1 {v1.h}[0], [x2]
495
496 srshr v2.8h, v2.8h, #6
497
 498 mov x3, x0
499 mov x4, #16
5001:
501 // Loop to add the constant from v2 into all 16x16 outputs
502 subs x4, x4, #2
503 ld1 {v3.16b}, [x0], x1
504 ld1 {v4.16b}, [x0], x1
505 uaddw v16.8h, v2.8h, v3.8b
506 uaddw2 v17.8h, v2.8h, v3.16b
507 uaddw v18.8h, v2.8h, v4.8b
508 uaddw2 v19.8h, v2.8h, v4.16b
509 sqxtun v3.8b, v16.8h
510 sqxtun2 v3.16b, v17.8h
511 sqxtun v4.8b, v18.8h
512 sqxtun2 v4.16b, v19.8h
513 st1 {v3.16b}, [x3], x1
514 st1 {v4.16b}, [x3], x1
515 b.ne 1b
516
517 ret
518endfunc
519
520.macro idct16_end
521 butterfly_8h v18, v7, v4, v7 // v18 = t0a, v7 = t7a
522 butterfly_8h v19, v22, v5, v22 // v19 = t1a, v22 = t6
523 butterfly_8h v4, v26, v20, v26 // v4 = t2a, v26 = t5
524 butterfly_8h v5, v6, v28, v6 // v5 = t3a, v6 = t4
525 butterfly_8h v20, v28, v16, v24 // v20 = t8a, v28 = t11a
526 butterfly_8h v24, v21, v23, v21 // v24 = t9, v21 = t10
527 butterfly_8h v23, v27, v25, v27 // v23 = t14, v27 = t13
528 butterfly_8h v25, v29, v29, v17 // v25 = t15a, v29 = t12a
529
530 dmbutterfly0 v2, v3, v27, v21, v2, v3, v16, v17, v30, v31 // v2 = t13a, v3 = t10a
531 dmbutterfly0 v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11
532
533 butterfly_8h v16, v31, v18, v25 // v16 = out[0], v31 = out[15]
534 butterfly_8h v17, v30, v19, v23 // v17 = out[1], v30 = out[14]
535 butterfly_8h_r v25, v22, v22, v24 // v25 = out[9], v22 = out[6]
536 butterfly_8h v23, v24, v7, v20 // v23 = out[7], v24 = out[8]
537 butterfly_8h v18, v29, v4, v2 // v18 = out[2], v29 = out[13]
538 butterfly_8h v19, v28, v5, v28 // v19 = out[3], v28 = out[12]
539 butterfly_8h v20, v27, v6, v27 // v20 = out[4], v27 = out[11]
540 butterfly_8h v21, v26, v26, v3 // v21 = out[5], v26 = out[10]
541 ret
542.endm
543
 544function idct16
545 dmbutterfly0 v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
546 dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
547 dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
548 dmbutterfly v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
549 dmbutterfly v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
550 dmbutterfly v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
551 dmbutterfly v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
552 dmbutterfly v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
553
554 butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3
555 butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2
556 butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5
557 butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6
558 butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9
559 butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10
560 butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13
561 butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14
562
563 dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
564 dmbutterfly v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
565 dmbutterfly v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
566 idct16_end
567endfunc
 568
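// Same as idct16, but assuming that only the first 8 inputs (v16-v23) are
// nonzero; v24-v31 are treated as zero.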
569function idct16_half
570 dmbutterfly0_h v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
571 dmbutterfly_h1 v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
572 dmbutterfly_h1 v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
573 dmbutterfly_h2 v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
574 dmbutterfly_h1 v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
575 dmbutterfly_h2 v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
576 dmbutterfly_h1 v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
577 dmbutterfly_h2 v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
 578
579 butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3
580 butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2
581 butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5
582 butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6
583 butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9
584 butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10
585 butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13
586 butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14
 587
588 dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
589 dmbutterfly v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
590 dmbutterfly v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
591 idct16_end
592endfunc
593
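// Same as idct16, but assuming that only the first 4 inputs (v16-v19) are
// nonzero.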
594function idct16_quarter
595 dsmull_h v24, v25, v19, v1.h[6]
596 dsmull_h v4, v5, v17, v0.h[7]
597 dsmull_h v7, v6, v18, v0.h[4]
598 dsmull_h v30, v31, v18, v0.h[3]
599 neg v24.4s, v24.4s
600 neg v25.4s, v25.4s
601 dsmull_h v29, v28, v17, v1.h[0]
602 dsmull_h v26, v27, v19, v1.h[5]
603 dsmull_h v22, v23, v16, v0.h[0]
604 drshrn_h v24, v24, v25, #14
605 drshrn_h v16, v4, v5, #14
606 drshrn_h v7, v7, v6, #14
607 drshrn_h v6, v30, v31, #14
608 drshrn_h v29, v29, v28, #14
609 drshrn_h v17, v26, v27, #14
610 drshrn_h v28, v22, v23, #14
611
612 dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.h[1], v0.h[2]
613 dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.h[1], v0.h[2]
614 neg v22.4s, v22.4s
615 neg v23.4s, v23.4s
616 drshrn_h v27, v20, v21, #14
617 drshrn_h v21, v22, v23, #14
618 drshrn_h v23, v18, v19, #14
619 drshrn_h v25, v30, v31, #14
620 mov v4.16b, v28.16b
621 mov v5.16b, v28.16b
622 dmbutterfly0 v22, v26, v7, v6, v18, v19, v30, v31
623 mov v20.16b, v28.16b
624 idct16_end
 625endfunc
 626
 627function iadst16
628 ld1 {v0.8h,v1.8h}, [x11]
629
630 dmbutterfly_l v6, v7, v4, v5, v31, v16, v0.h[1], v0.h[0] // v6,v7 = t1, v4,v5 = t0
631 dmbutterfly_l v10, v11, v8, v9, v23, v24, v1.h[1], v1.h[0] // v10,v11 = t9, v8,v9 = t8
632 dbutterfly_n v31, v24, v6, v7, v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a
633 dmbutterfly_l v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2] // v14,v15 = t3, v12,v13 = t2
634 dbutterfly_n v16, v23, v4, v5, v8, v9, v6, v7, v8, v9 // v16 = t0a, v23 = t8a
635
636 dmbutterfly_l v6, v7, v4, v5, v21, v26, v1.h[3], v1.h[2] // v6,v7 = t11, v4,v5 = t10
637 dbutterfly_n v29, v26, v14, v15, v6, v7, v8, v9, v6, v7 // v29 = t3a, v26 = t11a
638 dmbutterfly_l v10, v11, v8, v9, v27, v20, v0.h[5], v0.h[4] // v10,v11 = t5, v8,v9 = t4
639 dbutterfly_n v18, v21, v12, v13, v4, v5, v6, v7, v4, v5 // v18 = t2a, v21 = t10a
640
641 dmbutterfly_l v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4] // v14,v15 = t13, v12,v13 = t12
642 dbutterfly_n v20, v28, v10, v11, v14, v15, v4, v5, v14, v15 // v20 = t5a, v28 = t13a
643 dmbutterfly_l v6, v7, v4, v5, v25, v22, v0.h[7], v0.h[6] // v6,v7 = t7, v4,v5 = t6
644 dbutterfly_n v27, v19, v8, v9, v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a
645
646 dmbutterfly_l v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6] // v10,v11 = t15, v8,v9 = t14
647 ld1 {v0.8h}, [x10]
648 dbutterfly_n v22, v30, v6, v7, v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a
649 dmbutterfly_l v14, v15, v12, v13, v23, v24, v0.h[3], v0.h[4] // v14,v15 = t9, v12,v13 = t8
650 dbutterfly_n v25, v17, v4, v5, v8, v9, v6, v7, v8, v9 // v25 = t6a, v17 = t14a
651
652 dmbutterfly_l v4, v5, v6, v7, v28, v19, v0.h[4], v0.h[3] // v4,v5 = t12, v6,v7 = t13
653 dbutterfly_n v23, v19, v12, v13, v4, v5, v8, v9, v4, v5 // v23 = t8a, v19 = t12a
654 dmbutterfly_l v10, v11, v8, v9, v21, v26, v0.h[5], v0.h[6] // v10,v11 = t11, v8,v9 = t10
655 butterfly_8h_r v4, v27, v16, v27 // v4 = t4, v27 = t0
656 dbutterfly_n v24, v28, v14, v15, v6, v7, v12, v13, v6, v7 // v24 = t9a, v28 = t13a
657
658 dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[6], v0.h[5] // v12,v13 = t14, v14,v15 = t15
659 butterfly_8h_r v5, v20, v31, v20 // v5 = t5, v20 = t1
660 dbutterfly_n v21, v17, v8, v9, v12, v13, v6, v7, v12, v13 // v21 = t10a, v17 = t14a
661 dbutterfly_n v26, v30, v10, v11, v14, v15, v8, v9, v14, v15 // v26 = t11a, v30 = t15a
662
663 butterfly_8h_r v6, v25, v18, v25 // v6 = t6, v25 = t2
664 butterfly_8h_r v7, v22, v29, v22 // v7 = t7, v22 = t3
665
666 dmbutterfly_l v10, v11, v8, v9, v19, v28, v0.h[1], v0.h[2] // v10,v11 = t13, v8,v9 = t12
667 dmbutterfly_l v12, v13, v14, v15, v30, v17, v0.h[2], v0.h[1] // v12,v13 = t14, v14,v15 = t15
668
669 dbutterfly_n v18, v30, v8, v9, v12, v13, v16, v17, v12, v13 // v18 = out[2], v30 = t14a
670 dbutterfly_n v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a
671 neg v29.8h, v29.8h // v29 = out[13]
672
673 dmbutterfly_l v10, v11, v8, v9, v4, v5, v0.h[1], v0.h[2] // v10,v11 = t5a, v8,v9 = t4a
674 dmbutterfly_l v12, v13, v14, v15, v7, v6, v0.h[2], v0.h[1] // v12,v13 = t6a, v14,v15 = t7a
675
676 butterfly_8h v2, v6, v27, v25 // v2 = out[0], v6 = t2a
677 butterfly_8h v3, v7, v23, v21 // v3 =-out[1], v7 = t10
678
679 dbutterfly_n v19, v31, v8, v9, v12, v13, v4, v5, v8, v9 // v19 = -out[3], v31 = t6
680 neg v19.8h, v19.8h // v19 = out[3]
681 dbutterfly_n v28, v16, v10, v11, v14, v15, v4, v5, v10, v11 // v28 = out[12], v16 = t7
682
683 butterfly_8h v5, v8, v20, v22 // v5 =-out[15],v8 = t3a
684 butterfly_8h v4, v9, v24, v26 // v4 = out[14],v9 = t11
685
686 dmbutterfly0 v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
687 dmbutterfly0 v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
688 dmbutterfly0 v20, v27, v16, v31, v10, v11, v12, v13, v14, v15 // v20 = out[4], v27 = out[11]
689 dmbutterfly0 v22, v25, v9, v7, v10, v11, v12, v13, v14, v15 // v22 = out[6], v25 = out[9]
690
691 neg v31.8h, v5.8h // v31 = out[15]
692 neg v17.8h, v3.8h // v17 = out[1]
693
694 mov v16.16b, v2.16b
695 mov v30.16b, v4.16b
696 ret
697endfunc
698
699// Helper macros; we can't use these expressions directly within
700// e.g. .irp due to the extra concatenation \(). Therefore wrap
701// them in macros to allow using .irp below.
702.macro load i, src, inc
703 ld1 {v\i\().8h}, [\src], \inc
704.endm
705.macro store i, dst, inc
706 st1 {v\i\().8h}, [\dst], \inc
707.endm
708.macro movi_v i, size, imm
709 movi v\i\()\size, \imm
710.endm
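// Load one row and clear it in the source buffer; v2 must contain zeros.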
711.macro load_clear i, src, inc
712 ld1 {v\i\().8h}, [\src]
713 st1 {v2.8h}, [\src], \inc
714.endm
715
716.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
717 srshr \coef0, \coef0, #6
718 ld1 {v2.8b}, [x0], x1
719 srshr \coef1, \coef1, #6
720 ld1 {v3.8b}, [x3], x1
721 srshr \coef2, \coef2, #6
722 ld1 {v4.8b}, [x0], x1
723 srshr \coef3, \coef3, #6
724 uaddw \coef0, \coef0, v2.8b
725 ld1 {v5.8b}, [x3], x1
726 uaddw \coef1, \coef1, v3.8b
727 srshr \coef4, \coef4, #6
728 ld1 {v6.8b}, [x0], x1
729 srshr \coef5, \coef5, #6
730 ld1 {v7.8b}, [x3], x1
731 sqxtun v2.8b, \coef0
732 srshr \coef6, \coef6, #6
733 sqxtun v3.8b, \coef1
734 srshr \coef7, \coef7, #6
735 uaddw \coef2, \coef2, v4.8b
736 ld1 {\tmp1}, [x0], x1
737 uaddw \coef3, \coef3, v5.8b
738 ld1 {\tmp2}, [x3], x1
739 sqxtun v4.8b, \coef2
740 sub x0, x0, x1, lsl #2
741 sub x3, x3, x1, lsl #2
742 sqxtun v5.8b, \coef3
743 uaddw \coef4, \coef4, v6.8b
744 st1 {v2.8b}, [x0], x1
745 uaddw \coef5, \coef5, v7.8b
746 st1 {v3.8b}, [x3], x1
747 sqxtun v6.8b, \coef4
748 st1 {v4.8b}, [x0], x1
749 sqxtun v7.8b, \coef5
750 st1 {v5.8b}, [x3], x1
751 uaddw \coef6, \coef6, \tmp1
752 st1 {v6.8b}, [x0], x1
753 uaddw \coef7, \coef7, \tmp2
754 st1 {v7.8b}, [x3], x1
755 sqxtun \tmp1, \coef6
756 sqxtun \tmp2, \coef7
757 st1 {\tmp1}, [x0], x1
758 st1 {\tmp2}, [x3], x1
759.endm
760
761// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
762// transpose into a horizontal 16x8 slice and store.
763// x0 = dst (temp buffer)
 764// x1 = slice offset
 765// x2 = src
 766// x9 = input stride
767.macro itxfm16_1d_funcs txfm
768function \txfm\()16_1d_8x16_pass1_neon
769 mov x14, x30
770
771 movi v2.8h, #0
772.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
773 load_clear \i, x2, x9
774.endr
775
 776 bl \txfm\()16
777
778 // Do two 8x8 transposes. Originally, v16-v31 contain the
779 // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
780 // transposed 8x8 blocks.
781 transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
782 transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
783
784 // Store the transposed 8x8 blocks horizontally.
 785 cmp x1, #8
786 b.eq 1f
787.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
788 store \i, x0, #16
789.endr
 790 br x14
 7911:
 792 // Special case: For the last input column (x1 == 8),
793 // which would be stored as the last row in the temp buffer,
794 // don't store the first 8x8 block, but keep it in registers
795 // for the first slice of the second pass (where it is the
796 // last 8x8 block).
797.irp i, 24, 25, 26, 27, 28, 29, 30, 31
798 add x0, x0, #16
799 store \i, x0, #16
800.endr
801 mov v24.16b, v16.16b
802 mov v25.16b, v17.16b
803 mov v26.16b, v18.16b
804 mov v27.16b, v19.16b
805 mov v28.16b, v20.16b
806 mov v29.16b, v21.16b
807 mov v30.16b, v22.16b
808 mov v31.16b, v23.16b
 809 br x14
810endfunc
811
812// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
813// load the destination pixels (from a similar 8x16 slice), add and store back.
814// x0 = dst
815// x1 = dst stride
816// x2 = src (temp buffer)
817// x3 = slice offset
 818// x9 = temp buffer stride
 819function \txfm\()16_1d_8x16_pass2_neon
 820 mov x14, x30
821.irp i, 16, 17, 18, 19, 20, 21, 22, 23
822 load \i, x2, x9
823.endr
824 cbz x3, 1f
825.irp i, 24, 25, 26, 27, 28, 29, 30, 31
826 load \i, x2, x9
827.endr
8281:
829
830 add x3, x0, x1
831 lsl x1, x1, #1
 832 bl \txfm\()16
 833
834 load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
835 load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
 836
 837 br x14
838endfunc
839.endm
840
841itxfm16_1d_funcs idct
842itxfm16_1d_funcs iadst
843
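// Full 16x16 transform: pass 1 transforms two vertical 8x16 slices into a
// 512 byte temp buffer on the stack, pass 2 transforms two horizontal 8x16
// slices from that buffer and adds the result into dst.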
844.macro itxfm_func16x16 txfm1, txfm2
845function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
846.ifc \txfm1\()_\txfm2,idct_idct
 847 cmp w3, #1
848 b.eq idct16x16_dc_add_neon
849.endif
850 mov x15, x30
851 // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
852.ifnc \txfm1\()_\txfm2,idct_idct
853 stp d14, d15, [sp, #-0x10]!
854 stp d12, d13, [sp, #-0x10]!
855 stp d10, d11, [sp, #-0x10]!
856 stp d8, d9, [sp, #-0x10]!
857.endif
858
859 sub sp, sp, #512
860
861 mov x4, x0
862 mov x5, x1
863 mov x6, x2
864
865 movrel x10, idct_coeffs
866.ifnc \txfm1\()_\txfm2,idct_idct
867 movrel x11, iadst16_coeffs
868.endif
869.ifc \txfm1,idct
870 ld1 {v0.8h,v1.8h}, [x10]
871.endif
 872 mov x9, #32
 873
874.ifc \txfm1\()_\txfm2,idct_idct
875 cmp w3, #10
876 b.le idct16x16_quarter_add_neon
877 cmp w3, #38
878 b.le idct16x16_half_add_neon
879.endif
880
881.irp i, 0, 8
882 add x0, sp, #(\i*32)
883.ifc \txfm1\()_\txfm2,idct_idct
884.if \i == 8
885 cmp w3, #38
886 b.le 1f
887.endif
888.endif
889 mov x1, #\i
 890 add x2, x6, #(\i*2)
891 bl \txfm1\()16_1d_8x16_pass1_neon
892.endr
893.ifc \txfm1\()_\txfm2,iadst_idct
894 ld1 {v0.8h,v1.8h}, [x10]
895.endif
896
897.ifc \txfm1\()_\txfm2,idct_idct
898 b 3f
8991:
900 // Set v24-v31 to zero, for the in-register passthrough of
901 // coefficients to pass 2. Since we only do two slices, this can
902 // only ever happen for the second slice. So we only need to store
903 // zeros to the temp buffer for the second half of the buffer.
904 // Move x0 to the second half, and use x9 == 32 as increment.
905 add x0, x0, #16
906.irp i, 24, 25, 26, 27, 28, 29, 30, 31
907 movi_v \i, .16b, #0
908 st1 {v24.8h}, [x0], x9
909.endr
9103:
911.endif
912
913.irp i, 0, 8
914 add x0, x4, #(\i)
915 mov x1, x5
916 add x2, sp, #(\i*2)
917 mov x3, #\i
918 bl \txfm2\()16_1d_8x16_pass2_neon
919.endr
920
921 add sp, sp, #512
922.ifnc \txfm1\()_\txfm2,idct_idct
923 ldp d8, d9, [sp], 0x10
924 ldp d10, d11, [sp], 0x10
925 ldp d12, d13, [sp], 0x10
926 ldp d14, d15, [sp], 0x10
927.endif
928 br x15
929endfunc
930.endm
931
932itxfm_func16x16 idct, idct
933itxfm_func16x16 iadst, idct
934itxfm_func16x16 idct, iadst
935itxfm_func16x16 iadst, iadst
936
937function idct16_1d_8x16_pass1_quarter_neon
938 mov x14, x30
939 movi v2.8h, #0
940.irp i, 16, 17, 18, 19
941 load_clear \i, x2, x9
942.endr
943
944 bl idct16_quarter
945
946 // Do two 8x8 transposes. Originally, v16-v31 contain the
947 // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
948 // transposed 8x8 blocks.
949 transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
950 transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
951
952 // Store the transposed 8x8 blocks horizontally.
953 // The first 8x8 block is kept in registers for the second pass,
954 // store the rest in the temp buffer.
955 // Since only a 4x4 part of the input was nonzero, this means that
956 // only 4 rows are nonzero after transposing, and the second pass
957 // only reads the topmost 4 rows. Therefore only store the topmost
958 // 4 rows.
959 add x0, x0, #16
960.irp i, 24, 25, 26, 27
961 store \i, x0, x9
962.endr
963 br x14
964endfunc
965
966function idct16_1d_8x16_pass2_quarter_neon
967 mov x14, x30
968 cbz x3, 1f
969.irp i, 16, 17, 18, 19
970 load \i, x2, x9
971.endr
9721:
973
974 add x3, x0, x1
975 lsl x1, x1, #1
976 bl idct16_quarter
977
978 load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
979 load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
980
981 br x14
982endfunc
983
984function idct16_1d_8x16_pass1_half_neon
985 mov x14, x30
986 movi v2.8h, #0
987.irp i, 16, 17, 18, 19, 20, 21, 22, 23
988 load_clear \i, x2, x9
989.endr
990
991 bl idct16_half
992
993 // Do two 8x8 transposes. Originally, v16-v31 contain the
994 // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
995 // transposed 8x8 blocks.
996 transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
997 transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
998
999 // Store the transposed 8x8 blocks horizontally.
1000 // The first 8x8 block is kept in registers for the second pass,
1001 // store the rest in the temp buffer.
1002 add x0, x0, #16
1003.irp i, 24, 25, 26, 27, 28, 29, 30, 31
1004 store \i, x0, x9
1005.endr
1006 br x14
1007endfunc
1008
1009function idct16_1d_8x16_pass2_half_neon
1010 mov x14, x30
1011 cbz x3, 1f
1012.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1013 load \i, x2, x9
1014.endr
10151:
1016
1017 add x3, x0, x1
1018 lsl x1, x1, #1
1019 bl idct16_half
1020
1021 load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
1022 load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
1023
1024 br x14
1025endfunc
1026
1027.macro idct16_partial size
1028function idct16x16_\size\()_add_neon
1029 add x0, sp, #(0*32)
1030 add x2, x6, #(0*2)
1031 bl idct16_1d_8x16_pass1_\size\()_neon
1032.irp i, 0, 8
1033 add x0, x4, #(\i)
1034 mov x1, x5
1035 add x2, sp, #(\i*2)
1036 mov x3, #\i
1037 bl idct16_1d_8x16_pass2_\size\()_neon
1038.endr
1039
1040 add sp, sp, #512
1041 br x15
1042endfunc
1043.endm
1044
1045idct16_partial quarter
1046idct16_partial half
1047
1048function idct32x32_dc_add_neon
1049 movrel x4, idct_coeffs
1050 ld1 {v0.4h}, [x4]
1051
1052 movi v1.4h, #0
1053
 1054 ld1 {v2.h}[0], [x2]
1055 smull v2.4s, v2.4h, v0.h[0]
1056 rshrn v2.4h, v2.4s, #14
1057 smull v2.4s, v2.4h, v0.h[0]
1058 rshrn v2.4h, v2.4s, #14
1059 dup v2.8h, v2.h[0]
1060 st1 {v1.h}[0], [x2]
1061
1062 srshr v0.8h, v2.8h, #6
1063
 1064 mov x3, x0
1065 mov x4, #32
10661:
1067 // Loop to add the constant v0 into all 32x32 outputs
1068 subs x4, x4, #2
1069 ld1 {v1.16b,v2.16b}, [x0], x1
1070 uaddw v16.8h, v0.8h, v1.8b
1071 uaddw2 v17.8h, v0.8h, v1.16b
1072 ld1 {v3.16b,v4.16b}, [x0], x1
1073 uaddw v18.8h, v0.8h, v2.8b
1074 uaddw2 v19.8h, v0.8h, v2.16b
1075 uaddw v20.8h, v0.8h, v3.8b
1076 uaddw2 v21.8h, v0.8h, v3.16b
1077 uaddw v22.8h, v0.8h, v4.8b
1078 uaddw2 v23.8h, v0.8h, v4.16b
1079 sqxtun v1.8b, v16.8h
1080 sqxtun2 v1.16b, v17.8h
1081 sqxtun v2.8b, v18.8h
1082 sqxtun2 v2.16b, v19.8h
1083 sqxtun v3.8b, v20.8h
1084 sqxtun2 v3.16b, v21.8h
1085 st1 {v1.16b,v2.16b}, [x3], x1
1086 sqxtun v4.8b, v22.8h
1087 sqxtun2 v4.16b, v23.8h
1088 st1 {v3.16b,v4.16b}, [x3], x1
1089 b.ne 1b
1090
1091 ret
1092endfunc
1093
1094.macro idct32_end
1095 butterfly_8h v16, v5, v4, v5 // v16 = t16a, v5 = t19a
1096 butterfly_8h v17, v20, v23, v20 // v17 = t17, v20 = t18
1097 butterfly_8h v18, v6, v7, v6 // v18 = t23a, v6 = t20a
1098 butterfly_8h v19, v21, v22, v21 // v19 = t22, v21 = t21
1099 butterfly_8h v4, v28, v28, v30 // v4 = t24a, v28 = t27a
1100 butterfly_8h v23, v26, v25, v26 // v23 = t25, v26 = t26
1101 butterfly_8h v7, v3, v29, v31 // v7 = t31a, v3 = t28a
1102 butterfly_8h v22, v27, v24, v27 // v22 = t30, v27 = t29
1103
1104 dmbutterfly v27, v20, v0.h[1], v0.h[2], v24, v25, v30, v31 // v27 = t18a, v20 = t29a
1105 dmbutterfly v3, v5, v0.h[1], v0.h[2], v24, v25, v30, v31 // v3 = t19, v5 = t28
1106 dmbutterfly v28, v6, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v28 = t27, v6 = t20
1107 dmbutterfly v26, v21, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
1108
1109 butterfly_8h v31, v24, v7, v4 // v31 = t31, v24 = t24
1110 butterfly_8h v30, v25, v22, v23 // v30 = t30a, v25 = t25a
1111 butterfly_8h_r v23, v16, v16, v18 // v23 = t23, v16 = t16
1112 butterfly_8h_r v22, v17, v17, v19 // v22 = t22a, v17 = t17a
1113 butterfly_8h v18, v21, v27, v21 // v18 = t18, v21 = t21
1114 butterfly_8h_r v27, v28, v5, v28 // v27 = t27a, v28 = t28a
1115 butterfly_8h v29, v26, v20, v26 // v29 = t29, v26 = t26
1116 butterfly_8h v19, v20, v3, v6 // v19 = t19a, v20 = t20
1117
1118 dmbutterfly0 v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27, v20 = t20
1119 dmbutterfly0 v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
1120 dmbutterfly0 v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25, v22 = t22
1121 dmbutterfly0 v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
1122 ret
1123.endm
1124
 1125function idct32_odd
1126 ld1 {v0.8h,v1.8h}, [x11]
1127
1128 dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1129 dmbutterfly v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1130 dmbutterfly v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1131 dmbutterfly v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1132 dmbutterfly v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1133 dmbutterfly v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1134 dmbutterfly v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1135 dmbutterfly v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
1136
1137 ld1 {v0.8h}, [x10]
1138
1139 butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17
1140 butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18
1141 butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21
1142 butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22
1143 butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25
1144 butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26
1145 butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30
1146 butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29
1147
1148 dmbutterfly v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
1149 dmbutterfly v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1150 dmbutterfly v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
1151 dmbutterfly v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
1152 idct32_end
1153endfunc
 1154
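// Same as idct32_odd, but assuming that only the first 8 odd inputs
// (v16-v23) are nonzero.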
1155function idct32_odd_half
1156 ld1 {v0.8h,v1.8h}, [x11]
 1157
1158 dmbutterfly_h1 v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1159 dmbutterfly_h2 v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1160 dmbutterfly_h1 v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1161 dmbutterfly_h2 v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1162 dmbutterfly_h1 v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1163 dmbutterfly_h2 v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1164 dmbutterfly_h1 v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1165 dmbutterfly_h2 v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
 1166
 1167 ld1 {v0.8h}, [x10]
 1168
1169 butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17
1170 butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18
1171 butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21
1172 butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22
1173 butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25
1174 butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26
1175 butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30
1176 butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29
1177
1178 dmbutterfly v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
1179 dmbutterfly v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1180 dmbutterfly v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
1181 dmbutterfly v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
1182 idct32_end
 1183endfunc
 1184
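// Same as idct32_odd, but assuming that only the first 4 odd inputs
// (v16-v19) are nonzero.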
1185function idct32_odd_quarter
1186 ld1 {v0.8h,v1.8h}, [x11]
1187
1188 dsmull_h v4, v5, v16, v0.h[0]
1189 dsmull_h v28, v29, v19, v0.h[7]
1190 dsmull_h v30, v31, v16, v0.h[1]
1191 dsmull_h v22, v23, v17, v1.h[6]
1192 dsmull_h v7, v6, v17, v1.h[7]
1193 dsmull_h v26, v27, v19, v0.h[6]
1194 dsmull_h v20, v21, v18, v1.h[0]
1195 dsmull_h v24, v25, v18, v1.h[1]
1196
1197 ld1 {v0.8h}, [x10]
1198
1199 neg v28.4s, v28.4s
1200 neg v29.4s, v29.4s
1201 neg v7.4s, v7.4s
1202 neg v6.4s, v6.4s
1203
1204 drshrn_h v4, v4, v5, #14
1205 drshrn_h v5, v28, v29, #14
1206 drshrn_h v29, v30, v31, #14
1207 drshrn_h v28, v22, v23, #14
1208 drshrn_h v7, v7, v6, #14
1209 drshrn_h v31, v26, v27, #14
1210 drshrn_h v6, v20, v21, #14
1211 drshrn_h v30, v24, v25, #14
1212
1213 dmbutterfly_l v16, v17, v18, v19, v29, v4, v0.h[3], v0.h[4]
1214 dmbutterfly_l v27, v26, v20, v21, v31, v5, v0.h[3], v0.h[4]
1215 drshrn_h v23, v16, v17, #14
1216 drshrn_h v24, v18, v19, #14
1217 neg v20.4s, v20.4s
1218 neg v21.4s, v21.4s
1219 drshrn_h v27, v27, v26, #14
1220 drshrn_h v20, v20, v21, #14
1221 dmbutterfly_l v16, v17, v18, v19, v30, v6, v0.h[5], v0.h[6]
1222 drshrn_h v21, v16, v17, #14
1223 drshrn_h v26, v18, v19, #14
1224 dmbutterfly_l v16, v17, v18, v19, v28, v7, v0.h[5], v0.h[6]
1225 drshrn_h v25, v16, v17, #14
1226 neg v18.4s, v18.4s
1227 neg v19.4s, v19.4s
1228 drshrn_h v22, v18, v19, #14
1229
1230 idct32_end
1231endfunc
1232
1233.macro idct32_funcs suffix
 1234// Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix.
1235// The 32-point IDCT can be decomposed into two 16-point IDCTs;
1236// a normal IDCT16 with every other input component (the even ones, with
1237// each output written twice), followed by a separate 16-point IDCT
1238// of the odd inputs, added/subtracted onto the outputs of the first idct16.
1239// x0 = dst (temp buffer)
1240// x1 = unused
1241// x2 = src
 1242// x9 = double input stride
1243// x10 = idct_coeffs
1244// x11 = idct_coeffs + 32
 1245function idct32_1d_8x32_pass1\suffix\()_neon
 1246 mov x14, x30
1247 ld1 {v0.8h,v1.8h}, [x10]
1248
 1249 movi v2.8h, #0
1250
1251 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
 1252.ifb \suffix
 1253.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 1254 load_clear \i, x2, x9
 1255.endr
1256.endif
1257.ifc \suffix,_quarter
1258.irp i, 16, 17, 18, 19
1259 load_clear \i, x2, x9
1260.endr
1261.endif
1262.ifc \suffix,_half
1263.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1264 load_clear \i, x2, x9
1265.endr
1266.endif
 1267
 1268 bl idct16\suffix
1269
1270 // Do two 8x8 transposes. Originally, v16-v31 contain the
1271 // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
1272 // two transposed 8x8 blocks.
1273 transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
1274 transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
1275
1276 // Store the registers a, b horizontally, followed by the
1277 // same registers b, a mirrored.
1278.macro store_rev a, b
1279 // There's no rev128 instruction, but we reverse each 64 bit
1280 // half, and then flip them using an ext with 8 bytes offset.
1281 rev64 v1.8h, \b
1282 st1 {\a}, [x0], #16
1283 rev64 v0.8h, \a
 1284 ext v1.16b, v1.16b, v1.16b, #8
 1285 st1 {\b}, [x0], #16
1286 ext v0.16b, v0.16b, v0.16b, #8
1287 st1 {v1.8h}, [x0], #16
1288 st1 {v0.8h}, [x0], #16
1289.endm
1290 store_rev v16.8h, v24.8h
1291 store_rev v17.8h, v25.8h
1292 store_rev v18.8h, v26.8h
1293 store_rev v19.8h, v27.8h
1294 store_rev v20.8h, v28.8h
1295 store_rev v21.8h, v29.8h
1296 store_rev v22.8h, v30.8h
1297 store_rev v23.8h, v31.8h
1298 sub x0, x0, #512
1299.purgem store_rev
1300
1301 // Move x2 back to the start of the input, and move
1302 // to the first odd row
 1303.ifb \suffix
 1304 sub x2, x2, x9, lsl #4
1305.endif
1306.ifc \suffix,_quarter
1307 sub x2, x2, x9, lsl #2
1308.endif
1309.ifc \suffix,_half
1310 sub x2, x2, x9, lsl #3
1311.endif
1312 add x2, x2, #64
1313
 1314 movi v2.8h, #0
 1315 // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
 1316.ifb \suffix
 1317.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1318 load_clear \i, x2, x9
1319.endr
1320.endif
1321.ifc \suffix,_quarter
1322.irp i, 16, 17, 18, 19
1323 load_clear \i, x2, x9
1324.endr
1325.endif
1326.ifc \suffix,_half
1327.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1328 load_clear \i, x2, x9
 1329.endr
 1330.endif
 1331
 1332 bl idct32_odd\suffix
 1333
1334 transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
1335 transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
1336
 1337 // Store the registers a, b horizontally,
 1338 // adding into the output first, then the mirrored
 1339 // registers, subtracted from the output.
1340.macro store_rev a, b
1341 ld1 {v4.8h}, [x0]
1342 rev64 v1.8h, \b
1343 add v4.8h, v4.8h, \a
1344 rev64 v0.8h, \a
1345 st1 {v4.8h}, [x0], #16
1346 ext v1.16b, v1.16b, v1.16b, #8
1347 ld1 {v5.8h}, [x0]
1348 ext v0.16b, v0.16b, v0.16b, #8
 1349 add v5.8h, v5.8h, \b
1350 st1 {v5.8h}, [x0], #16
1351 ld1 {v6.8h}, [x0]
1352 sub v6.8h, v6.8h, v1.8h
1353 st1 {v6.8h}, [x0], #16
1354 ld1 {v7.8h}, [x0]
1355 sub v7.8h, v7.8h, v0.8h
1356 st1 {v7.8h}, [x0], #16
1357.endm
1358
1359 store_rev v31.8h, v23.8h
1360 store_rev v30.8h, v22.8h
1361 store_rev v29.8h, v21.8h
1362 store_rev v28.8h, v20.8h
1363 store_rev v27.8h, v19.8h
1364 store_rev v26.8h, v18.8h
1365 store_rev v25.8h, v17.8h
1366 store_rev v24.8h, v16.8h
 1367.purgem store_rev
 1368 br x14
1369endfunc
1370
 1371// This is mostly the same as 8x32_pass1, but without the transpose; it uses
 1372// the source as a temp buffer between the two idct passes, and adds into the
 1373// destination.
1374// x0 = dst
1375// x1 = dst stride
1376// x2 = src (temp buffer)
1377// x7 = negative double temp buffer stride
1378// x9 = double temp buffer stride
1379// x10 = idct_coeffs
1380// x11 = idct_coeffs + 32
 1381function idct32_1d_8x32_pass2\suffix\()_neon
 1382 mov x14, x30
1383 ld1 {v0.8h,v1.8h}, [x10]
1384
 1385 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
 1386.ifb \suffix
 1387.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 1388 load \i, x2, x9
1389.endr
1390 sub x2, x2, x9, lsl #4
1391.endif
1392.ifc \suffix,_quarter
1393.irp i, 16, 17, 18, 19
1394 load \i, x2, x9
1395.endr
1396 sub x2, x2, x9, lsl #2
1397.endif
1398.ifc \suffix,_half
1399.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1400 load \i, x2, x9
1401.endr
1402 sub x2, x2, x9, lsl #3
1403.endif
 1404
 1405 bl idct16\suffix
 1406
 1407.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 1408 store \i, x2, x9
1409.endr
1410
1411 sub x2, x2, x9, lsl #4
1412 add x2, x2, #64
1413
1414 // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
a63da451 1415.ifb \suffix
3c9546df 1416.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
a63da451 1417 load \i, x2, x9
3c9546df
MS
1418.endr
1419 sub x2, x2, x9, lsl #4
1420.endif
1421.ifc \suffix,_quarter
1422.irp i, 16, 17, 18, 19
1423 load \i, x2, x9
1424.endr
1425 sub x2, x2, x9, lsl #2
1426.endif
1427.ifc \suffix,_half
1428.irp i, 16, 17, 18, 19, 20, 21, 22, 23
1429 load \i, x2, x9
1430.endr
1431 sub x2, x2, x9, lsl #3
1432.endif
1433 sub x2, x2, #64
1434
 1435 bl idct32_odd\suffix
 1436
 1437.macro load_acc_store a, b, c, d, neg=0
 1438.if \neg == 0
1439 ld1 {v4.8h}, [x2], x9
1440 ld1 {v5.8h}, [x2], x9
 1441 add v4.8h, v4.8h, \a
 1442 ld1 {v6.8h}, [x2], x9
 1443 add v5.8h, v5.8h, \b
 1444 ld1 {v7.8h}, [x2], x9
1445 add v6.8h, v6.8h, \c
1446 add v7.8h, v7.8h, \d
 1447.else
1448 ld1 {v4.8h}, [x2], x7
1449 ld1 {v5.8h}, [x2], x7
 1450 sub v4.8h, v4.8h, \a
 1451 ld1 {v6.8h}, [x2], x7
 1452 sub v5.8h, v5.8h, \b
 1453 ld1 {v7.8h}, [x2], x7
1454 sub v6.8h, v6.8h, \c
1455 sub v7.8h, v7.8h, \d
1456.endif
1457 ld1 {v0.8b}, [x0], x1
1458 ld1 {v1.8b}, [x0], x1
1459 srshr v4.8h, v4.8h, #6
1460 ld1 {v2.8b}, [x0], x1
1461 srshr v5.8h, v5.8h, #6
1462 uaddw v4.8h, v4.8h, v0.8b
1463 ld1 {v3.8b}, [x0], x1
1464 srshr v6.8h, v6.8h, #6
1465 uaddw v5.8h, v5.8h, v1.8b
1466 srshr v7.8h, v7.8h, #6
1467 sub x0, x0, x1, lsl #2
1468 uaddw v6.8h, v6.8h, v2.8b
1469 sqxtun v4.8b, v4.8h
1470 uaddw v7.8h, v7.8h, v3.8b
1471 sqxtun v5.8b, v5.8h
1472 st1 {v4.8b}, [x0], x1
1473 sqxtun v6.8b, v6.8h
1474 st1 {v5.8b}, [x0], x1
1475 sqxtun v7.8b, v7.8h
1476 st1 {v6.8b}, [x0], x1
1477 st1 {v7.8b}, [x0], x1
1478.endm
1479 load_acc_store v31.8h, v30.8h, v29.8h, v28.8h
1480 load_acc_store v27.8h, v26.8h, v25.8h, v24.8h
1481 load_acc_store v23.8h, v22.8h, v21.8h, v20.8h
1482 load_acc_store v19.8h, v18.8h, v17.8h, v16.8h
 1483 sub x2, x2, x9
1484 load_acc_store v16.8h, v17.8h, v18.8h, v19.8h, 1
1485 load_acc_store v20.8h, v21.8h, v22.8h, v23.8h, 1
1486 load_acc_store v24.8h, v25.8h, v26.8h, v27.8h, 1
1487 load_acc_store v28.8h, v29.8h, v30.8h, v31.8h, 1
 1488.purgem load_acc_store
 1489 br x14
 1490endfunc
1491.endm
1492
1493idct32_funcs
1494idct32_funcs _quarter
1495idct32_funcs _half
 1496
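// Per 8-column slice eob thresholds for the 32x32 idct; if eob is less than
// or equal to the value for a slice, that slice and all following ones contain
// only zero coefficients, so their first pass is skipped.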
1497const min_eob_idct_idct_32, align=4
1498 .short 0, 34, 135, 336
1499endconst
1500
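// Full 32x32 idct: pass 1 transforms four vertical 8x32 slices into a
// 2048 byte temp buffer on the stack, pass 2 transforms four horizontal 8x32
// slices from that buffer and adds the result into dst.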
 1501function ff_vp9_idct_idct_32x32_add_neon, export=1
 1502 cmp w3, #1
1503 b.eq idct32x32_dc_add_neon
1504
1505 movrel x10, idct_coeffs
1506 add x11, x10, #32
 1507 movrel x12, min_eob_idct_idct_32, 2
1508
1509 mov x15, x30
1510
1511 stp d14, d15, [sp, #-0x10]!
1512 stp d12, d13, [sp, #-0x10]!
1513 stp d10, d11, [sp, #-0x10]!
1514 stp d8, d9, [sp, #-0x10]!
1515
1516 sub sp, sp, #2048
1517
1518 mov x4, x0
1519 mov x5, x1
1520 mov x6, x2
1521
1522 // Double stride of the input, since we only read every other line
1523 mov x9, #128
1524 neg x7, x9
1525
1526 cmp w3, #34
1527 b.le idct32x32_quarter_add_neon
1528 cmp w3, #135
1529 b.le idct32x32_half_add_neon
1530
1531.irp i, 0, 8, 16, 24
1532 add x0, sp, #(\i*64)
1533.if \i > 0
1534 ldrh w1, [x12], #2
1535 cmp w3, w1
1536 mov x1, #(32 - \i)/4
1537 b.le 1f
1538.endif
1539 add x2, x6, #(\i*2)
1540 bl idct32_1d_8x32_pass1_neon
1541.endr
1542 b 3f
1543
15441:
1545 // Write zeros to the temp buffer for pass 2
1546 movi v16.8h, #0
1547 movi v17.8h, #0
1548 movi v18.8h, #0
1549 movi v19.8h, #0
15502:
1551 subs x1, x1, #1
1552.rept 4
1553 st1 {v16.8h-v19.8h}, [x0], #64
1554.endr
1555 b.ne 2b
15563:
1557.irp i, 0, 8, 16, 24
1558 add x0, x4, #(\i)
1559 mov x1, x5
1560 add x2, sp, #(\i*2)
1561 bl idct32_1d_8x32_pass2_neon
1562.endr
1563
1564 add sp, sp, #2048
1565
1566 ldp d8, d9, [sp], 0x10
1567 ldp d10, d11, [sp], 0x10
1568 ldp d12, d13, [sp], 0x10
1569 ldp d14, d15, [sp], 0x10
1570
1571 br x15
1572endfunc
1573
1574.macro idct32_partial size
1575function idct32x32_\size\()_add_neon
1576 add x0, sp, #(0*64)
1577 add x2, x6, #(0*2)
1578 bl idct32_1d_8x32_pass1_\size\()_neon
1579.ifc \size,half
1580 add x0, sp, #(8*64)
1581 add x2, x6, #(8*2)
1582 bl idct32_1d_8x32_pass1_\size\()_neon
1583.endif
1584.irp i, 0, 8, 16, 24
1585 add x0, x4, #(\i)
1586 mov x1, x5
1587 add x2, sp, #(\i*2)
1588 bl idct32_1d_8x32_pass2_\size\()_neon
1589.endr
1590
1591 add sp, sp, #2048
1592
1593 ldp d8, d9, [sp], 0x10
1594 ldp d10, d11, [sp], 0x10
1595 ldp d12, d13, [sp], 0x10
1596 ldp d14, d15, [sp], 0x10
1597
1598 br x15
1599endfunc
1600.endm
1601
1602idct32_partial quarter
1603idct32_partial half