arm: vc1dsp: Add commas between macro arguments
[libav.git] / libavcodec / arm / vc1dsp_neon.S
1/*
2 * VC1 NEON optimisations
3 *
4 * Copyright (c) 2010 Rob Clark <rob@ti.com>
5 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
6 *
7 * This file is part of Libav.
8 *
9 * Libav is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * Libav is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with Libav; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24#include "libavutil/arm/asm.S"
25#include "neon.S"
26
27#include "config.h"
28
29@ Transpose rows into columns of a matrix of 16-bit elements. For 4x4, pass
30@ double-word registers, for 8x4, pass quad-word registers.
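@ (For reference: vtrn.16 exchanges the odd 16-bit lanes of the first operand
@ with the even 16-bit lanes of the second, and vtrn.32 does the same with
@ 32-bit lanes, so the two passes below transpose each 4x4 block of 16-bit
@ elements; the resulting register layouts are listed at the end of the macro.)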
31.macro transpose16 r0, r1, r2, r3
32 @ At this point:
33 @ row[0] r0
34 @ row[1] r1
35 @ row[2] r2
36 @ row[3] r3
37
38 vtrn.16 \r0, \r1 @ first and second row
39 vtrn.16 \r2, \r3 @ third and fourth row
40 vtrn.32 \r0, \r2 @ first and third row
41 vtrn.32 \r1, \r3 @ second and fourth row
42
43 @ At this point, if registers are quad-word:
44 @ column[0] d0
45 @ column[1] d2
46 @ column[2] d4
47 @ column[3] d6
48 @ column[4] d1
49 @ column[5] d3
50 @ column[6] d5
51 @ column[7] d7
52
53 @ At this point, if registers are double-word:
54 @ column[0] d0
55 @ column[1] d1
56 @ column[2] d2
57 @ column[3] d3
58.endm
59
 60@ ff_vc1_inv_trans_{4,8}x{4,8}_neon and overflow: the input values in a file
 61@ are supposed to be in a specific range so as to allow 16-bit math without
 62@ causing overflows, but sometimes the input values are just big enough to
 63@ barely cause overflow in vadd instructions like:
64@
65@ vadd.i16 q0, q8, q10
66@ vshr.s16 q0, q0, #\rshift
67@
68@ To prevent these borderline cases from overflowing, we just need one more
69@ bit of precision, which is accomplished by replacing the sequence above with:
70@
71@ vhadd.s16 q0, q8, q10
72@ vshr.s16 q0, q0, #(\rshift -1)
73@
74@ This works because vhadd is a single instruction that adds, then shifts to
75@ the right once, all before writing the result to the destination register.
76@
77@ Even with this workaround, there were still some files that caused overflows
78@ in ff_vc1_inv_trans_8x8_neon. See the comments in ff_vc1_inv_trans_8x8_neon
79@ for the additional workaround.
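@ Worked example of the vhadd trick above: for a == b == 20000, a 16-bit vadd
@ wraps to -25536, while vhadd.s16 computes (a + b) >> 1 == 20000 without
@ overflowing; since ((a + b) >> 1) >> (rshift - 1) == (a + b) >> rshift for
@ arithmetic shifts, the final result is unchanged.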
80
 81@ Takes 4 columns of 8 values each and operates on them. Modeled after the first
82@ for loop in vc1_inv_trans_4x8_c.
83@ Input columns: q0 q1 q2 q3
84@ Output columns: q0 q1 q2 q3
85@ Trashes: r12 q8 q9 q10 q11 q12 q13
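@ As a sketch, the per-column computation below corresponds roughly to:
@   t1 = 17 * (src[0] + src[2]) + add;   t2 = 17 * (src[0] - src[2]) + add;
@   t3 = 22 * src[1] + 10 * src[3];      t4 = 22 * src[3] - 10 * src[1];
@   dst[0] = (t1 + t3) >> rshift;        dst[1] = (t2 - t4) >> rshift;
@   dst[2] = (t2 + t4) >> rshift;        dst[3] = (t1 - t3) >> rshift;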
86.macro vc1_inv_trans_4x8_helper add rshift
87 @ Compute temp1, temp2 and setup scalar #17, #22, #10
88 vadd.i16 q12, q0, q2 @ temp1 = src[0] + src[2]
89 movw r12, #17
90 vsub.i16 q13, q0, q2 @ temp2 = src[0] - src[2]
91 movt r12, #22
92 vmov.32 d0[0], r12
93 movw r12, #10
94 vmov.16 d1[0], r12
95
96 vmov.i16 q8, #\add @ t1 will accumulate here
97 vmov.i16 q9, #\add @ t2 will accumulate here
98
99 vmul.i16 q10, q1, d0[1] @ t3 = 22 * (src[1])
100 vmul.i16 q11, q3, d0[1] @ t4 = 22 * (src[3])
101
102 vmla.i16 q8, q12, d0[0] @ t1 = 17 * (temp1) + 4
103 vmla.i16 q9, q13, d0[0] @ t2 = 17 * (temp2) + 4
104
105 vmla.i16 q10, q3, d1[0] @ t3 += 10 * src[3]
106 vmls.i16 q11, q1, d1[0] @ t4 -= 10 * src[1]
107
108 vhadd.s16 q0, q8, q10 @ dst[0] = (t1 + t3) >> 1
109 vhsub.s16 q3, q8, q10 @ dst[3] = (t1 - t3) >> 1
110 vhsub.s16 q1, q9, q11 @ dst[1] = (t2 - t4) >> 1
111 vhadd.s16 q2, q9, q11 @ dst[2] = (t2 + t4) >> 1
112
113 @ Halving add/sub above already did one shift
114 vshr.s16 q0, q0, #(\rshift - 1) @ dst[0] >>= (rshift - 1)
115 vshr.s16 q3, q3, #(\rshift - 1) @ dst[3] >>= (rshift - 1)
116 vshr.s16 q1, q1, #(\rshift - 1) @ dst[1] >>= (rshift - 1)
117 vshr.s16 q2, q2, #(\rshift - 1) @ dst[2] >>= (rshift - 1)
118.endm
119
 120@ Takes 8 columns of 4 values each and operates on them. Modeled after the second
121@ for loop in vc1_inv_trans_4x8_c.
122@ Input columns: d0 d2 d4 d6 d1 d3 d5 d7
123@ Output columns: d16 d17 d18 d19 d21 d20 d23 d22
124@ Trashes all NEON registers (and r12) except for: q4 q5 q6 q7
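@ As a sketch, the per-column computation below corresponds roughly to:
@   t1 = 12 * (src[0] + src[32]) + add;  t2 = 12 * (src[0] - src[32]) + add;
@   t3 = 16 * src[16] + 6 * src[48];     t4 = 6 * src[16] - 16 * src[48];
@   t5 = t1 + t3;  t6 = t2 + t4;  t7 = t2 - t4;  t8 = t1 - t3;
@   t1 = 16*src[8] + 15*src[24] +  9*src[40] +  4*src[56];
@   t2 = 15*src[8] -  4*src[24] - 16*src[40] -  9*src[56];
@   t3 =  9*src[8] - 16*src[24] +  4*src[40] + 15*src[56];
@   t4 =  4*src[8] -  9*src[24] + 15*src[40] - 16*src[56];
@   dst[0..3] = (t5,t6,t7,t8 + t1,t2,t3,t4) >> rshift
@   dst[4..7] = (t8,t7,t6,t5 - t4,t3,t2,t1 [+ 1 if add1beforeshift]) >> rshift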
125.macro vc1_inv_trans_8x4_helper add add1beforeshift rshift
126 @ At this point:
127 @ src[0] d0 overwritten later
128 @ src[8] d2
129 @ src[16] d4 overwritten later
130 @ src[24] d6
131 @ src[32] d1 overwritten later
132 @ src[40] d3
133 @ src[48] d5 overwritten later
134 @ src[56] d7
135
136 movw r12, #12
137 vmov.i16 q14, #\add @ t1|t2 will accumulate here
138 movt r12, #6
139
140 vadd.i16 d20, d0, d1 @ temp1 = src[0] + src[32]
141 vsub.i16 d21, d0, d1 @ temp2 = src[0] - src[32]
142 vmov.i32 d0[0], r12 @ 16-bit: d0[0] = #12, d0[1] = #6
143
144 vshl.i16 q15, q2, #4 @ t3|t4 = 16 * (src[16]|src[48])
145 vswp d4, d5 @ q2 = src[48]|src[16]
146 vmla.i16 q14, q10, d0[0] @ t1|t2 = 12 * (temp1|temp2) + 64
147 movw r12, #15
148 movt r12, #9
149 vmov.i32 d0[1], r12 @ 16-bit: d0[2] = #15, d0[3] = #9
150 vneg.s16 d31, d31 @ t4 = -t4
151 vmla.i16 q15, q2, d0[1] @ t3|t4 += 6 * (src[48]|src[16])
152
153 @ At this point:
154 @ d0[2] #15
155 @ d0[3] #9
156 @ q1 src[8]|src[40]
157 @ q3 src[24]|src[56]
158 @ q14 old t1|t2
159 @ q15 old t3|t4
160
161 vshl.i16 q8, q1, #4 @ t1|t2 = 16 * (src[8]|src[40])
162 vswp d2, d3 @ q1 = src[40]|src[8]
163 vshl.i16 q12, q3, #4 @ temp3a|temp4a = 16 * src[24]|src[56]
164 vswp d6, d7 @ q3 = src[56]|src[24]
165 vshl.i16 q13, q1, #2 @ temp3b|temp4b = 4 * (src[40]|src[8])
166 vshl.i16 q2, q3, #2 @ temp1|temp2 = 4 * (src[56]|src[24])
167 vswp d3, d6 @ q1 = src[40]|src[56], q3 = src[8]|src[24]
168 vsub.i16 q9, q13, q12 @ t3|t4 = - (temp3a|temp4a) + (temp3b|temp4b)
169 vadd.i16 q8, q8, q2 @ t1|t2 += temp1|temp2
170 vmul.i16 q12, q3, d0[3] @ temp3|temp4 = 9 * src[8]|src[24]
171 vmla.i16 q8, q1, d0[3] @ t1|t2 += 9 * (src[40]|src[56])
172 vswp d6, d7 @ q3 = src[24]|src[8]
173 vswp d2, d3 @ q1 = src[56]|src[40]
174
175 vsub.i16 q11, q14, q15 @ t8|t7 = old t1|t2 - old t3|t4
176 vadd.i16 q10, q14, q15 @ t5|t6 = old t1|t2 + old t3|t4
177 .if \add1beforeshift
178 vmov.i16 q15, #1
179 .endif
180
181 vadd.i16 d18, d18, d24 @ t3 += temp3
182 vsub.i16 d19, d19, d25 @ t4 -= temp4
183
184 vswp d22, d23 @ q11 = t7|t8
185
186 vneg.s16 d17, d17 @ t2 = -t2
187 vmla.i16 q9, q1, d0[2] @ t3|t4 += 15 * src[56]|src[40]
188 vmla.i16 q8, q3, d0[2] @ t1|t2 += 15 * src[24]|src[8]
189
190 @ At this point:
191 @ t1 d16
192 @ t2 d17
193 @ t3 d18
194 @ t4 d19
195 @ t5 d20
196 @ t6 d21
197 @ t7 d22
198 @ t8 d23
199 @ #1 q15
200
201 .if \add1beforeshift
202 vadd.i16 q3, q15, q10 @ line[7,6] = t5|t6 + 1
203 vadd.i16 q2, q15, q11 @ line[5,4] = t7|t8 + 1
204 .endif
205
206 @ Sometimes this overflows, so to get one additional bit of precision, use
207 @ a single instruction that both adds and shifts right (halving).
208 vhadd.s16 q1, q9, q11 @ line[2,3] = (t3|t4 + t7|t8) >> 1
209 vhadd.s16 q0, q8, q10 @ line[0,1] = (t1|t2 + t5|t6) >> 1
210 .if \add1beforeshift
211 vhsub.s16 q2, q2, q9 @ line[5,4] = (t7|t8 - t3|t4 + 1) >> 1
212 vhsub.s16 q3, q3, q8 @ line[7,6] = (t5|t6 - t1|t2 + 1) >> 1
213 .else
214 vhsub.s16 q2, q11, q9 @ line[5,4] = (t7|t8 - t3|t4) >> 1
215 vhsub.s16 q3, q10, q8 @ line[7,6] = (t5|t6 - t1|t2) >> 1
216 .endif
217
218 vshr.s16 q9, q1, #(\rshift - 1) @ one shift is already done by vhadd/vhsub above
219 vshr.s16 q8, q0, #(\rshift - 1)
220 vshr.s16 q10, q2, #(\rshift - 1)
221 vshr.s16 q11, q3, #(\rshift - 1)
222
223 @ At this point:
224 @ dst[0] d16
225 @ dst[1] d17
226 @ dst[2] d18
227 @ dst[3] d19
228 @ dst[4] d21
229 @ dst[5] d20
230 @ dst[6] d23
231 @ dst[7] d22
232.endm
233
 234@ This is modeled after the first and second for loops in vc1_inv_trans_8x8_c.
235@ Input columns: q8, q9, q10, q11, q12, q13, q14, q15
236@ Output columns: q8, q9, q10, q11, q12, q13, q14, q15
237@ Trashes all NEON registers (and r12) except for: q4 q5 q6 q7
238.macro vc1_inv_trans_8x8_helper add add1beforeshift rshift
239 @ This actually computes half of t1, t2, t3, t4, as explained below
240 @ near `tNhalf`.
241 vmov.i16 q0, #(6 / 2) @ q0 = #6/2
242 vshl.i16 q1, q10, #3 @ t3 = 16/2 * src[16]
243 vshl.i16 q3, q14, #3 @ temp4 = 16/2 * src[48]
244 vmul.i16 q2, q10, q0 @ t4 = 6/2 * src[16]
245 vmla.i16 q1, q14, q0 @ t3 += 6/2 * src[48]
246 @ unused: q0, q10, q14
247 vmov.i16 q0, #(12 / 2) @ q0 = #12/2
248 vadd.i16 q10, q8, q12 @ temp1 = src[0] + src[32]
249 vsub.i16 q14, q8, q12 @ temp2 = src[0] - src[32]
250 @ unused: q8, q12
251 vmov.i16 q8, #(\add / 2) @ t1 will accumulate here
252 vmov.i16 q12, #(\add / 2) @ t2 will accumulate here
253 movw r12, #15
254 vsub.i16 q2, q2, q3 @ t4 = 6/2 * src[16] - 16/2 * src[48]
255 movt r12, #9
256 @ unused: q3
257 vmla.i16 q8, q10, q0 @ t1 = 12/2 * temp1 + add
258 vmla.i16 q12, q14, q0 @ t2 = 12/2 * temp2 + add
259 vmov.i32 d0[0], r12
260 @ unused: q3, q10, q14
261
262 @ At this point:
263 @ q0 d0=#15|#9
264 @ q1 old t3
265 @ q2 old t4
266 @ q3
267 @ q8 old t1
268 @ q9 src[8]
269 @ q10
270 @ q11 src[24]
271 @ q12 old t2
272 @ q13 src[40]
273 @ q14
274 @ q15 src[56]
275
276 @ unused: q3, q10, q14
277 movw r12, #16
278 vshl.i16 q3, q9, #4 @ t1 = 16 * src[8]
279 movt r12, #4
280 vshl.i16 q10, q9, #2 @ t4 = 4 * src[8]
281 vmov.i32 d1[0], r12
282 vmul.i16 q14, q9, d0[0] @ t2 = 15 * src[8]
283 vmul.i16 q9, q9, d0[1] @ t3 = 9 * src[8]
284 @ unused: none
285 vmla.i16 q3, q11, d0[0] @ t1 += 15 * src[24]
286 vmls.i16 q10, q11, d0[1] @ t4 -= 9 * src[24]
287 vmls.i16 q14, q11, d1[1] @ t2 -= 4 * src[24]
288 vmls.i16 q9, q11, d1[0] @ t3 -= 16 * src[24]
289 @ unused: q11
290 vmla.i16 q3, q13, d0[1] @ t1 += 9 * src[40]
291 vmla.i16 q10, q13, d0[0] @ t4 += 15 * src[40]
292 vmls.i16 q14, q13, d1[0] @ t2 -= 16 * src[40]
293 vmla.i16 q9, q13, d1[1] @ t3 += 4 * src[40]
294 @ unused: q11, q13
295
296 @ Compute t5, t6, t7, t8 from old t1, t2, t3, t4. Actually, it computes
297 @ half of t5, t6, t7, t8 since t1, t2, t3, t4 are halved.
298 vadd.i16 q11, q8, q1 @ t5 = t1 + t3
299 vsub.i16 q1, q8, q1 @ t8 = t1 - t3
300 vadd.i16 q13, q12, q2 @ t6 = t2 + t4
301 vsub.i16 q2, q12, q2 @ t7 = t2 - t4
302 @ unused: q8, q12
303
304 .if \add1beforeshift
305 vmov.i16 q12, #1
306 .endif
307
308 @ unused: q8
309 vmla.i16 q3, q15, d1[1] @ t1 += 4 * src[56]
310 vmls.i16 q14, q15, d0[1] @ t2 -= 9 * src[56]
311 vmla.i16 q9, q15, d0[0] @ t3 += 15 * src[56]
312 vmls.i16 q10, q15, d1[0] @ t4 -= 16 * src[56]
313 @ unused: q0, q8, q15
314
315 @ At this point:
316 @ t1 q3
317 @ t2 q14
318 @ t3 q9
319 @ t4 q10
320 @ t5half q11
321 @ t6half q13
322 @ t7half q2
323 @ t8half q1
324 @ #1 q12
325 @
326 @ tNhalf is half of the value of tN (as described in vc1_inv_trans_8x8_c).
327 @ This is done because sometimes files have input that causes tN + tM to
328 @ overflow. To avoid this overflow, we compute tNhalf, then compute
329 @ tNhalf + tM (which doesn't overflow), and then we use vhadd to compute
330 @ (tNhalf + (tNhalf + tM)) >> 1 which does not overflow because it is
331 @ one instruction.
332
333 @ For each pair of tN and tM, do:
334 @ lineA = t5half + t1
335 @ if add1beforeshift: t1 -= 1
336 @ lineA = (t5half + lineA) >> 1
337 @ lineB = t5half - t1
338 @ lineB = (t5half + lineB) >> 1
339 @ lineA >>= rshift - 1
340 @ lineB >>= rshift - 1
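@ Why this is equivalent: tN == 2 * tNhalf exactly (all the halved constants
@ divide evenly), so e.g. (t5 + t1) >> 1 == (t5half + (t5half + t1)) >> 1,
@ and the intermediate sum t5half + t1 fits in 16 bits.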
341
342 vadd.i16 q8, q11, q3 @ q8 = t5half + t1
343 .if \add1beforeshift
344 vsub.i16 q3, q3, q12 @ q3 = t1 - 1
345 .endif
346
347 vadd.i16 q0, q13, q14 @ q0 = t6half + t2
348 .if \add1beforeshift
349 vsub.i16 q14, q14, q12 @ q14 = t2 - 1
350 .endif
351
352 vadd.i16 q15, q2, q9 @ q15 = t7half + t3
353 .if \add1beforeshift
354 vsub.i16 q9, q9, q12 @ q9 = t3 - 1
355 .endif
356 @ unused: none
357
358 vhadd.s16 q8, q11, q8 @ q8 = (t5half + t5half + t1) >> 1
359 vsub.i16 q3, q11, q3 @ q3 = t5half - t1 + 1
360
361 vhadd.s16 q0, q13, q0 @ q0 = (t6half + t6half + t2) >> 1
362 vsub.i16 q14, q13, q14 @ q14 = t6half - t2 + 1
363
364 vhadd.s16 q15, q2, q15 @ q15 = (t7half + t7half + t3) >> 1
365 vsub.i16 q9, q2, q9 @ q9 = t7half - t3 + 1
366
367 vhadd.s16 q3, q11, q3 @ q3 = (t5half + t5half - t1 + 1) >> 1
368 @ unused: q11
369
370 vadd.i16 q11, q1, q10 @ q11 = t8half + t4
371 .if \add1beforeshift
372 vsub.i16 q10, q10, q12 @ q10 = t4 - 1
373 .endif
374 @ unused: q12
375
376 vhadd.s16 q14, q13, q14 @ q14 = (t6half + t6half - t2 + 1) >> 1
377 @ unused: q12, q13
378 vhadd.s16 q13, q2, q9 @ q9 = (t7half + t7half - t3 + 1) >> 1
379 @ unused: q12, q2, q9
380
381 vsub.i16 q10, q1, q10 @ q10 = t8half - t4 + 1
382 vhadd.s16 q11, q1, q11 @ q11 = (t8half + t8half + t4) >> 1
383
384 vshr.s16 q8, q8, #(\rshift - 1) @ q8 = line[0]
385 vhadd.s16 q12, q1, q10 @ q12 = (t8half + t8half - t4 + 1) >> 1
386 vshr.s16 q9, q0, #(\rshift - 1) @ q9 = line[1]
387 vshr.s16 q10, q15, #(\rshift - 1) @ q10 = line[2]
388 vshr.s16 q11, q11, #(\rshift - 1) @ q11 = line[3]
389 vshr.s16 q12, q12, #(\rshift - 1) @ q12 = line[4]
390 vshr.s16 q13, q13, #(\rshift - 1) @ q13 = line[5]
391 vshr.s16 q14, q14, #(\rshift - 1) @ q14 = line[6]
392 vshr.s16 q15, q3, #(\rshift - 1) @ q15 = line[7]
393.endm
394
395@ (int16_t *block [r0])
396function ff_vc1_inv_trans_8x8_neon, export=1
397 vld1.64 {q8-q9}, [r0,:128]!
398 vld1.64 {q10-q11}, [r0,:128]!
399 vld1.64 {q12-q13}, [r0,:128]!
400 vld1.64 {q14-q15}, [r0,:128]
401 sub r0, r0, #(16 * 2 * 3) @ restore r0
402
403 @ At this point:
404 @ src[0] q8
405 @ src[8] q9
406 @ src[16] q10
407 @ src[24] q11
408 @ src[32] q12
409 @ src[40] q13
410 @ src[48] q14
411 @ src[56] q15
412
 413        vc1_inv_trans_8x8_helper add=4, add1beforeshift=0, rshift=3
414
415 @ Transpose result matrix of 8x8
416 swap4 d17, d19, d21, d23, d24, d26, d28, d30
417 transpose16_4x4 q8, q9, q10, q11, q12, q13, q14, q15
418
 419        vc1_inv_trans_8x8_helper add=64, add1beforeshift=1, rshift=7
420
421 vst1.64 {q8-q9}, [r0,:128]!
422 vst1.64 {q10-q11}, [r0,:128]!
423 vst1.64 {q12-q13}, [r0,:128]!
424 vst1.64 {q14-q15}, [r0,:128]
425
426 bx lr
427endfunc
428
 429@ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
430function ff_vc1_inv_trans_8x4_neon, export=1
431 vld1.64 {q0-q1}, [r2,:128]! @ load 8 * 4 * 2 = 64 bytes / 16 bytes per quad = 4 quad registers
432 vld1.64 {q2-q3}, [r2,:128]
433
 434        transpose16 q0, q1, q2, q3 @ transpose rows to columns
435
436 @ At this point:
437 @ src[0] d0
438 @ src[1] d2
439 @ src[2] d4
440 @ src[3] d6
441 @ src[4] d1
442 @ src[5] d3
443 @ src[6] d5
444 @ src[7] d7
445
 446        vc1_inv_trans_8x4_helper add=4, add1beforeshift=0, rshift=3
447
448 @ Move output to more standardized registers
449 vmov d0, d16
450 vmov d2, d17
451 vmov d4, d18
452 vmov d6, d19
453 vmov d1, d21
454 vmov d3, d20
455 vmov d5, d23
456 vmov d7, d22
457
458 @ At this point:
459 @ dst[0] d0
460 @ dst[1] d2
461 @ dst[2] d4
462 @ dst[3] d6
463 @ dst[4] d1
464 @ dst[5] d3
465 @ dst[6] d5
466 @ dst[7] d7
467
 468        transpose16 q0, q1, q2, q3 @ turn columns into rows
469
470 @ At this point:
471 @ row[0] q0
472 @ row[1] q1
473 @ row[2] q2
474 @ row[3] q3
475
 476        vc1_inv_trans_4x8_helper add=64, rshift=7
477
478 @ At this point:
479 @ line[0].l d0
480 @ line[0].h d1
481 @ line[1].l d2
482 @ line[1].h d3
483 @ line[2].l d4
484 @ line[2].h d5
485 @ line[3].l d6
486 @ line[3].h d7
487
488 @ unused registers: q12, q13, q14, q15
489
490 vld1.64 {d28}, [r0,:64], r1 @ read dest
491 vld1.64 {d29}, [r0,:64], r1
492 vld1.64 {d30}, [r0,:64], r1
493 vld1.64 {d31}, [r0,:64], r1
494 sub r0, r0, r1, lsl #2 @ restore original r0 value
495
496 vaddw.u8 q0, q0, d28 @ line[0] += dest[0]
497 vaddw.u8 q1, q1, d29 @ line[1] += dest[1]
498 vaddw.u8 q2, q2, d30 @ line[2] += dest[2]
499 vaddw.u8 q3, q3, d31 @ line[3] += dest[3]
500
501 vqmovun.s16 d0, q0 @ line[0]
502 vqmovun.s16 d1, q1 @ line[1]
503 vqmovun.s16 d2, q2 @ line[2]
504 vqmovun.s16 d3, q3 @ line[3]
505
506 vst1.64 {d0}, [r0,:64], r1 @ write dest
507 vst1.64 {d1}, [r0,:64], r1
508 vst1.64 {d2}, [r0,:64], r1
509 vst1.64 {d3}, [r0,:64]
510
511 bx lr
512endfunc
513
 514@ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
515function ff_vc1_inv_trans_4x8_neon, export=1
516 mov r12, #(8 * 2) @ 8 elements per line, each element 2 bytes
517 vld4.16 {d0[], d2[], d4[], d6[]}, [r2,:64], r12 @ read each column into a q register
518 vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r2,:64], r12
519 vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r2,:64], r12
520 vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r2,:64], r12
521 vld4.16 {d1[], d3[], d5[], d7[]}, [r2,:64], r12
522 vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r2,:64], r12
523 vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r2,:64], r12
524 vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r2,:64]
525
 526        vc1_inv_trans_4x8_helper add=4, rshift=3
527
528 @ At this point:
529 @ dst[0] = q0
530 @ dst[1] = q1
531 @ dst[2] = q2
532 @ dst[3] = q3
533
 534        transpose16 q0, q1, q2, q3 @ Transpose rows (registers) into columns
 535
 536        vc1_inv_trans_8x4_helper add=64, add1beforeshift=1, rshift=7
537
538 vld1.32 {d28[]}, [r0,:32], r1 @ read dest
539 vld1.32 {d28[1]}, [r0,:32], r1
540 vld1.32 {d29[]}, [r0,:32], r1
541 vld1.32 {d29[1]}, [r0,:32], r1
542
543 vld1.32 {d30[]}, [r0,:32], r1
544 vld1.32 {d30[0]}, [r0,:32], r1
545 vld1.32 {d31[]}, [r0,:32], r1
546 vld1.32 {d31[0]}, [r0,:32], r1
547 sub r0, r0, r1, lsl #3 @ restore original r0 value
548
549 vaddw.u8 q8, q8, d28 @ line[0,1] += dest[0,1]
550 vaddw.u8 q9, q9, d29 @ line[2,3] += dest[2,3]
551 vaddw.u8 q10, q10, d30 @ line[5,4] += dest[5,4]
552 vaddw.u8 q11, q11, d31 @ line[7,6] += dest[7,6]
553
554 vqmovun.s16 d16, q8 @ clip(line[0,1])
555 vqmovun.s16 d18, q9 @ clip(line[2,3])
556 vqmovun.s16 d20, q10 @ clip(line[5,4])
557 vqmovun.s16 d22, q11 @ clip(line[7,6])
558
559 vst1.32 {d16[0]}, [r0,:32], r1 @ write dest
560 vst1.32 {d16[1]}, [r0,:32], r1
561 vst1.32 {d18[0]}, [r0,:32], r1
562 vst1.32 {d18[1]}, [r0,:32], r1
563
564 vst1.32 {d20[1]}, [r0,:32], r1
565 vst1.32 {d20[0]}, [r0,:32], r1
566 vst1.32 {d22[1]}, [r0,:32], r1
567 vst1.32 {d22[0]}, [r0,:32]
568
569 bx lr
570endfunc
571
 572@ Set up constants in registers which are used by vc1_inv_trans_4x4_helper
573.macro vc1_inv_trans_4x4_helper_setup
574 vmov.i16 q13, #17
575 vmov.i16 q14, #22
576 vmov.i16 d30, #10 @ only need double-word, not quad-word
577.endm
578
579@ This is modeled after the first for loop in vc1_inv_trans_4x4_c.
580.macro vc1_inv_trans_4x4_helper add rshift
581 vmov.i16 q2, #\add @ t1|t2 will accumulate here
582
583 vadd.i16 d16, d0, d1 @ temp1 = src[0] + src[2]
584 vsub.i16 d17, d0, d1 @ temp2 = src[0] - src[2]
585 vmul.i16 q3, q14, q1 @ t3|t4 = 22 * (src[1]|src[3])
586 vmla.i16 q2, q13, q8 @ t1|t2 = 17 * (temp1|temp2) + add
587 vmla.i16 d6, d30, d3 @ t3 += 10 * src[3]
588 vmls.i16 d7, d30, d2 @ t4 -= 10 * src[1]
589
590 vadd.i16 q0, q2, q3 @ dst[0,2] = (t1|t2 + t3|t4)
591 vsub.i16 q1, q2, q3 @ dst[3,1] = (t1|t2 - t3|t4)
592 vshr.s16 q0, q0, #\rshift @ dst[0,2] >>= rshift
593 vshr.s16 q1, q1, #\rshift @ dst[3,1] >>= rshift
594.endm
595
 596@ (uint8_t *dest [r0], ptrdiff_t stride [r1], int16_t *block [r2])
597function ff_vc1_inv_trans_4x4_neon, export=1
598 mov r12, #(8 * 2) @ 8 elements per line, each element 2 bytes
599 vld4.16 {d0[], d1[], d2[], d3[]}, [r2,:64], r12 @ read each column into a register
600 vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r2,:64], r12
601 vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r2,:64], r12
602 vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r2,:64]
603
604 vswp d1, d2 @ so that we can later access column 1 and column 3 as a single q1 register
605
606 vc1_inv_trans_4x4_helper_setup
607
608 @ At this point:
609 @ src[0] = d0
610 @ src[1] = d2
611 @ src[2] = d1
612 @ src[3] = d3
613
 614        vc1_inv_trans_4x4_helper add=4, rshift=3 @ compute t1, t2, t3, t4 and combine them into dst[0-3]
615
616 @ At this point:
617 @ dst[0] = d0
618 @ dst[1] = d3
619 @ dst[2] = d1
620 @ dst[3] = d2
621
 622        transpose16 d0, d3, d1, d2 @ Transpose rows (registers) into columns
623
624 @ At this point:
625 @ src[0] = d0
626 @ src[8] = d3
627 @ src[16] = d1
628 @ src[24] = d2
629
630 vswp d2, d3 @ so that we can later access column 1 and column 3 in order as a single q1 register
631
632 @ At this point:
633 @ src[0] = d0
634 @ src[8] = d2
635 @ src[16] = d1
636 @ src[24] = d3
637
 638        vc1_inv_trans_4x4_helper add=64, rshift=7 @ compute t1, t2, t3, t4 and combine them into dst[0-3]
639
640 @ At this point:
641 @ line[0] = d0
642 @ line[1] = d3
643 @ line[2] = d1
644 @ line[3] = d2
645
646 vld1.32 {d18[]}, [r0,:32], r1 @ read dest
647 vld1.32 {d19[]}, [r0,:32], r1
648 vld1.32 {d18[1]}, [r0,:32], r1
649 vld1.32 {d19[0]}, [r0,:32], r1
650 sub r0, r0, r1, lsl #2 @ restore original r0 value
651
652 vaddw.u8 q0, q0, d18 @ line[0,2] += dest[0,2]
653 vaddw.u8 q1, q1, d19 @ line[3,1] += dest[3,1]
654
655 vqmovun.s16 d0, q0 @ clip(line[0,2])
656 vqmovun.s16 d1, q1 @ clip(line[3,1])
657
658 vst1.32 {d0[0]}, [r0,:32], r1 @ write dest
659 vst1.32 {d1[1]}, [r0,:32], r1
660 vst1.32 {d0[1]}, [r0,:32], r1
661 vst1.32 {d1[0]}, [r0,:32]
662
663 bx lr
664endfunc
665
 666@ The absolute values of the multiplication constants from vc1_mspel_filter and vc1_mspel_{ver,hor}_filter_16bits.
667@ The sign is embedded in the code below that carries out the multiplication (mspel_filter{,.16}).
668#define MSPEL_MODE_1_MUL_CONSTANTS 4, 53, 18, 3
669#define MSPEL_MODE_2_MUL_CONSTANTS 1, 9, 9, 1
670#define MSPEL_MODE_3_MUL_CONSTANTS 3, 18, 53, 4
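@ Each mode is a 4-tap filter of the form (-a, b, c, -d) applied to the four
@ neighbouring samples; the taps sum to 64, 16 and 64 respectively, matching
@ the MSPEL_MODE_*_ADDSHIFT shift values 6, 4 and 6 further below
@ (1 << 6 == 64, 1 << 4 == 16).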
671
 672@ These constants come from reading the source code of vc1_mspel_mc: they are the value that
 673@ is added to `rnd` to produce the variable `r`, and the value of the variable `shift`.
674#define MSPEL_MODES_11_ADDSHIFT_CONSTANTS 15, 5
675#define MSPEL_MODES_12_ADDSHIFT_CONSTANTS 3, 3
676#define MSPEL_MODES_13_ADDSHIFT_CONSTANTS 15, 5
 677#define MSPEL_MODES_21_ADDSHIFT_CONSTANTS MSPEL_MODES_12_ADDSHIFT_CONSTANTS
678#define MSPEL_MODES_22_ADDSHIFT_CONSTANTS 0, 1
679#define MSPEL_MODES_23_ADDSHIFT_CONSTANTS 3, 3
680#define MSPEL_MODES_31_ADDSHIFT_CONSTANTS MSPEL_MODES_13_ADDSHIFT_CONSTANTS
681#define MSPEL_MODES_32_ADDSHIFT_CONSTANTS MSPEL_MODES_23_ADDSHIFT_CONSTANTS
 682#define MSPEL_MODES_33_ADDSHIFT_CONSTANTS 15, 5
683
684@ The addition and shift constants from vc1_mspel_filter.
685#define MSPEL_MODE_1_ADDSHIFT_CONSTANTS 32, 6
686#define MSPEL_MODE_2_ADDSHIFT_CONSTANTS 8, 4
687#define MSPEL_MODE_3_ADDSHIFT_CONSTANTS 32, 6
688
 689@ Set up constants in registers for a subsequent use of mspel_filter{,.16}.
690.macro mspel_constants typesize reg_a reg_b reg_c reg_d filter_a filter_b filter_c filter_d reg_add filter_add_register
 691        @ Typesize should be i8 or i16.
692
 693        @ Only load a register if its constant is not 1 and not a duplicate of another filter constant
694 .if \filter_a != 1
 695        vmov.\typesize  \reg_a, #\filter_a      @ reg_a = filter_a
 696  .endif
 697        vmov.\typesize  \reg_b, #\filter_b      @ reg_b = filter_b
 698  .if \filter_b != \filter_c
 699        vmov.\typesize  \reg_c, #\filter_c      @ reg_c = filter_c
700 .endif
701 .if \filter_d != 1
 702        vmov.\typesize  \reg_d, #\filter_d      @ reg_d = filter_d
703 .endif
704 @ vdup to double the size of typesize
705 .ifc \typesize,i8
706 vdup.16 \reg_add, \filter_add_register @ reg_add = filter_add_register
707 .else
708 vdup.32 \reg_add, \filter_add_register @ reg_add = filter_add_register
709 .endif
710.endm
711
712@ After mspel_constants has been used, do the filtering.
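@ In C-like terms, this computes (a sketch; the signs of filter_a and filter_d
@ are negative, as noted for the MSPEL constants above):
@   acc  = reg_add - filter_a*src0 + filter_b*src1 + filter_c*src2 - filter_d*src3
@   dest = narrow ? clip_uint8(acc >> filter_shift) : (acc >> filter_shift)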
713.macro mspel_filter acc dest src0 src1 src2 src3 filter_a filter_b filter_c filter_d reg_a reg_b reg_c reg_d reg_add filter_shift narrow=1
714 .if \filter_a != 1
715 @ If filter_a != 1, then we need a move and subtract instruction
716 vmov \acc, \reg_add @ acc = reg_add
717 vmlsl.u8 \acc, \reg_a, \src0 @ acc -= filter_a * src[-stride]
718 .else
719 @ If filter_a is 1, then just subtract without an extra move
720 vsubw.u8 \acc, \reg_add, \src0 @ acc = reg_add - src[-stride] @ since filter_a == 1
721 .endif
722 vmlal.u8 \acc, \reg_b, \src1 @ acc += filter_b * src[0]
723 .if \filter_b != \filter_c
724 vmlal.u8 \acc, \reg_c, \src2 @ acc += filter_c * src[stride]
725 .else
726 @ If filter_b is the same as filter_c, use the same reg_b register
727 vmlal.u8 \acc, \reg_b, \src2 @ acc += filter_c * src[stride] @ where filter_c == filter_b
728 .endif
729 .if \filter_d != 1
730 @ If filter_d != 1, then do a multiply accumulate
731 vmlsl.u8 \acc, \reg_d, \src3 @ acc -= filter_d * src[stride * 2]
732 .else
733 @ If filter_d is 1, then just do a subtract
734 vsubw.u8 \acc, \acc, \src3 @ acc -= src[stride * 2] @ since filter_d == 1
735 .endif
736 .if \narrow
737 vqshrun.s16 \dest, \acc, #\filter_shift @ dest = clip_uint8(acc >> filter_shift)
738 .else
739 vshr.s16 \dest, \acc, #\filter_shift @ dest = acc >> filter_shift
740 .endif
741.endm
742
743@ This is similar to mspel_filter, but the input is 16-bit instead of 8-bit and narrow=0 is not supported.
744.macro mspel_filter.16 acc0 acc1 acc0_0 acc0_1 dest src0 src1 src2 src3 src4 src5 src6 src7 filter_a filter_b filter_c filter_d reg_a reg_b reg_c reg_d reg_add filter_shift
745 .if \filter_a != 1
746 vmov \acc0, \reg_add
747 vmov \acc1, \reg_add
748 vmlsl.s16 \acc0, \reg_a, \src0
749 vmlsl.s16 \acc1, \reg_a, \src1
750 .else
751 vsubw.s16 \acc0, \reg_add, \src0
752 vsubw.s16 \acc1, \reg_add, \src1
753 .endif
754 vmlal.s16 \acc0, \reg_b, \src2
755 vmlal.s16 \acc1, \reg_b, \src3
756 .if \filter_b != \filter_c
757 vmlal.s16 \acc0, \reg_c, \src4
758 vmlal.s16 \acc1, \reg_c, \src5
759 .else
760 vmlal.s16 \acc0, \reg_b, \src4
761 vmlal.s16 \acc1, \reg_b, \src5
762 .endif
763 .if \filter_d != 1
764 vmlsl.s16 \acc0, \reg_d, \src6
765 vmlsl.s16 \acc1, \reg_d, \src7
766 .else
767 vsubw.s16 \acc0, \acc0, \src6
768 vsubw.s16 \acc1, \acc1, \src7
769 .endif
770 @ Use acc0_0 and acc0_1 as temp space
771 vqshrun.s32 \acc0_0, \acc0, #\filter_shift @ Shift and narrow with saturation from s32 to u16
772 vqshrun.s32 \acc0_1, \acc1, #\filter_shift
773 vqmovn.u16 \dest, \acc0 @ Narrow with saturation from u16 to u8
774.endm
775
776@ Register usage for put_vc1_mspel_mc functions. Registers marked 'hv' are only used in put_vc1_mspel_mc_hv.
777@
778@ r0 adjusted dst
779@ r1 adjusted src
780@ r2 stride
781@ r3 adjusted rnd
782@ r4 [hv] tmp
783@ r11 [hv] sp saved
784@ r12 loop counter
785@ d0 src[-stride]
786@ d1 src[0]
787@ d2 src[stride]
788@ d3 src[stride * 2]
789@ q0 [hv] src[-stride]
790@ q1 [hv] src[0]
791@ q2 [hv] src[stride]
792@ q3 [hv] src[stride * 2]
793@ d21 often result from mspel_filter
794@ q11 accumulator 0
795@ q12 [hv] accumulator 1
796@ q13 accumulator initial value
797@ d28 filter_a
798@ d29 filter_b
799@ d30 filter_c
800@ d31 filter_d
801
802@ (uint8_t *dst [r0], const uint8_t *src [r1], ptrdiff_t stride [r2], int rnd [r3])
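@ Structure (as implemented below): a vertical filtering pass writes 16-bit,
@ non-narrowed intermediates into a temporary buffer on the re-aligned stack,
@ then a horizontal pass reads that buffer with mspel_filter.16 and narrows
@ the result back to 8 bits into dst.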
803.macro put_vc1_mspel_mc_hv hmode vmode filter_h_a filter_h_b filter_h_c filter_h_d filter_v_a filter_v_b filter_v_c filter_v_d filter_add filter_shift
804function ff_put_vc1_mspel_mc\hmode\()\vmode\()_neon, export=1
805 push {r4, r11, lr}
 806        mov             r11, sp                @ r11 = stack pointer before realignment
807A bic sp, sp, #15 @ sp = round down to multiple of 16 bytes
808T bic r4, r11, #15
809T mov sp, r4
 810        sub             sp, sp, #(8*2*16)      @ make space for 8 rows * 2 bytes per element * 16 elements per row (to fit 11 actual elements per row)
811 mov r4, sp @ r4 = int16_t tmp[8 * 16]
812
813 sub r1, r1, #1 @ src -= 1
814 .if \filter_add != 0
815 add r3, r3, #\filter_add @ r3 = filter_add + rnd
816 .endif
817 mov r12, #8 @ loop counter
818 sub r1, r1, r2 @ r1 = &src[-stride] @ slide back
819
820 @ Do vertical filtering from src into tmp
 821        mspel_constants i8, d28, d29, d30, d31, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, q13, r3
822
823 vld1.64 {d0,d1}, [r1], r2
824 vld1.64 {d2,d3}, [r1], r2
825 vld1.64 {d4,d5}, [r1], r2
826
8271:
828 subs r12, r12, #4
829
830 vld1.64 {d6,d7}, [r1], r2
831 mspel_filter q11, q11, d0, d2, d4, d6, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
832 mspel_filter q12, q12, d1, d3, d5, d7, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
833 vst1.64 {q11,q12}, [r4,:128]! @ store and increment
834
835 vld1.64 {d0,d1}, [r1], r2
836 mspel_filter q11, q11, d2, d4, d6, d0, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
837 mspel_filter q12, q12, d3, d5, d7, d1, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
838 vst1.64 {q11,q12}, [r4,:128]! @ store and increment
839
840 vld1.64 {d2,d3}, [r1], r2
841 mspel_filter q11, q11, d4, d6, d0, d2, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
842 mspel_filter q12, q12, d5, d7, d1, d3, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
843 vst1.64 {q11,q12}, [r4,:128]! @ store and increment
844
845 vld1.64 {d4,d5}, [r1], r2
846 mspel_filter q11, q11, d6, d0, d2, d4, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
847 mspel_filter q12, q12, d7, d1, d3, d5, \filter_v_a, \filter_v_b, \filter_v_c, \filter_v_d, d28, d29, d30, d31, q13, \filter_shift, narrow=0
848 vst1.64 {q11,q12}, [r4,:128]! @ store and increment
849
850 bne 1b
851
852 rsb r3, r3, #(64 + \filter_add) @ r3 = (64 + filter_add) - r3
853 mov r12, #8 @ loop counter
854 mov r4, sp @ r4 = tmp
855
856 @ Do horizontal filtering from temp to dst
 857        mspel_constants i16, d28, d29, d30, d31, \filter_h_a, \filter_h_b, \filter_h_c, \filter_h_d, q13, r3
858
8592:
860 subs r12, r12, #1
861
862 vld1.64 {q0,q1}, [r4,:128]! @ read one line of tmp
863 vext.16 q2, q0, q1, #2
864 vext.16 q3, q0, q1, #3
865 vext.16 q1, q0, q1, #1 @ do last because it writes to q1 which is read by the other vext instructions
866
 867        mspel_filter.16 q11, q12, d22, d23, d21, d0, d1, d2, d3, d4, d5, d6, d7, \filter_h_a, \filter_h_b, \filter_h_c, \filter_h_d, d28, d29, d30, d31, q13, 7
868
869 vst1.64 {d21}, [r0,:64], r2 @ store and increment dst
870
871 bne 2b
872
873 mov sp, r11
874 pop {r4, r11, pc}
875endfunc
876.endm
877
878@ Use C preprocessor and assembler macros to expand to functions for horizontal and vertical filtering.
879#define PUT_VC1_MSPEL_MC_HV(hmode, vmode) \
880 put_vc1_mspel_mc_hv hmode, vmode, \
881 MSPEL_MODE_ ## hmode ## _MUL_CONSTANTS, \
882 MSPEL_MODE_ ## vmode ## _MUL_CONSTANTS, \
883 MSPEL_MODES_ ## hmode ## vmode ## _ADDSHIFT_CONSTANTS
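@ For example, PUT_VC1_MSPEL_MC_HV(1, 2) expands to
@   put_vc1_mspel_mc_hv 1, 2, 4, 53, 18, 3, 1, 9, 9, 1, 3, 3
@ which defines ff_put_vc1_mspel_mc12_neon.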
884
885PUT_VC1_MSPEL_MC_HV(1, 1)
886PUT_VC1_MSPEL_MC_HV(1, 2)
887PUT_VC1_MSPEL_MC_HV(1, 3)
888PUT_VC1_MSPEL_MC_HV(2, 1)
889PUT_VC1_MSPEL_MC_HV(2, 2)
890PUT_VC1_MSPEL_MC_HV(2, 3)
891PUT_VC1_MSPEL_MC_HV(3, 1)
892PUT_VC1_MSPEL_MC_HV(3, 2)
893PUT_VC1_MSPEL_MC_HV(3, 3)
894
895#undef PUT_VC1_MSPEL_MC_HV
896
897.macro put_vc1_mspel_mc_h_only hmode filter_a filter_b filter_c filter_d filter_add filter_shift
898function ff_put_vc1_mspel_mc\hmode\()0_neon, export=1
899 rsb r3, r3, #\filter_add @ r3 = filter_add - r = filter_add - rnd
900 mov r12, #8 @ loop counter
901 sub r1, r1, #1 @ slide back, using immediate
902
 903        mspel_constants i8, d28, d29, d30, d31, \filter_a, \filter_b, \filter_c, \filter_d, q13, r3
904
9051:
906 subs r12, r12, #1
907
908 vld1.64 {d0,d1}, [r1], r2 @ read 16 bytes even though we only need 11, also src += stride
909 vext.8 d2, d0, d1, #2
910 vext.8 d3, d0, d1, #3
911 vext.8 d1, d0, d1, #1 @ do last because it writes to d1 which is read by the other vext instructions
912
 913        mspel_filter q11, d21, d0, d1, d2, d3, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
914
915 vst1.64 {d21}, [r0,:64], r2 @ store and increment dst
916
917 bne 1b
918
919 bx lr
920endfunc
921.endm
922
923@ Use C preprocessor and assembler macros to expand to functions for horizontal only filtering.
924#define PUT_VC1_MSPEL_MC_H_ONLY(hmode) \
 925        put_vc1_mspel_mc_h_only hmode, MSPEL_MODE_ ## hmode ## _MUL_CONSTANTS, MSPEL_MODE_ ## hmode ## _ADDSHIFT_CONSTANTS
926
927PUT_VC1_MSPEL_MC_H_ONLY(1)
928PUT_VC1_MSPEL_MC_H_ONLY(2)
929PUT_VC1_MSPEL_MC_H_ONLY(3)
930
931#undef PUT_VC1_MSPEL_MC_H_ONLY
932
933@ (uint8_t *dst [r0], const uint8_t *src [r1], ptrdiff_t stride [r2], int rnd [r3])
934.macro put_vc1_mspel_mc_v_only vmode filter_a filter_b filter_c filter_d filter_add filter_shift
935function ff_put_vc1_mspel_mc0\vmode\()_neon, export=1
936 add r3, r3, #\filter_add - 1 @ r3 = filter_add - r = filter_add - (1 - rnd) = filter_add - 1 + rnd
937 mov r12, #8 @ loop counter
938 sub r1, r1, r2 @ r1 = &src[-stride] @ slide back
939
 940        mspel_constants i8, d28, d29, d30, d31, \filter_a, \filter_b, \filter_c, \filter_d, q13, r3
941
942 vld1.64 {d0}, [r1], r2 @ d0 = src[-stride]
943 vld1.64 {d1}, [r1], r2 @ d1 = src[0]
944 vld1.64 {d2}, [r1], r2 @ d2 = src[stride]
945
9461:
947 subs r12, r12, #4
948
949 vld1.64 {d3}, [r1], r2 @ d3 = src[stride * 2]
 950        mspel_filter q11, d21, d0, d1, d2, d3, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
951 vst1.64 {d21}, [r0,:64], r2 @ store and increment dst
952
953 vld1.64 {d0}, [r1], r2 @ d0 = next line
 954        mspel_filter q11, d21, d1, d2, d3, d0, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
955 vst1.64 {d21}, [r0,:64], r2 @ store and increment dst
956
957 vld1.64 {d1}, [r1], r2 @ d1 = next line
 958        mspel_filter q11, d21, d2, d3, d0, d1, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
959 vst1.64 {d21}, [r0,:64], r2 @ store and increment dst
960
961 vld1.64 {d2}, [r1], r2 @ d2 = next line
 962        mspel_filter q11, d21, d3, d0, d1, d2, \filter_a, \filter_b, \filter_c, \filter_d, d28, d29, d30, d31, q13, \filter_shift
963 vst1.64 {d21}, [r0,:64], r2 @ store and increment dst
964
965 bne 1b
966
967 bx lr
968endfunc
969.endm
970
971@ Use C preprocessor and assembler macros to expand to functions for vertical only filtering.
972#define PUT_VC1_MSPEL_MC_V_ONLY(vmode) \
 973        put_vc1_mspel_mc_v_only vmode, MSPEL_MODE_ ## vmode ## _MUL_CONSTANTS, MSPEL_MODE_ ## vmode ## _ADDSHIFT_CONSTANTS
974
975PUT_VC1_MSPEL_MC_V_ONLY(1)
976PUT_VC1_MSPEL_MC_V_ONLY(2)
977PUT_VC1_MSPEL_MC_V_ONLY(3)
978
979#undef PUT_VC1_MSPEL_MC_V_ONLY
980
981function ff_put_pixels8x8_neon, export=1
982 vld1.64 {d0}, [r1], r2
983 vld1.64 {d1}, [r1], r2
984 vld1.64 {d2}, [r1], r2
985 vld1.64 {d3}, [r1], r2
986 vld1.64 {d4}, [r1], r2
987 vld1.64 {d5}, [r1], r2
988 vld1.64 {d6}, [r1], r2
989 vld1.64 {d7}, [r1]
990 vst1.64 {d0}, [r0,:64], r2
991 vst1.64 {d1}, [r0,:64], r2
992 vst1.64 {d2}, [r0,:64], r2
993 vst1.64 {d3}, [r0,:64], r2
994 vst1.64 {d4}, [r0,:64], r2
995 vst1.64 {d5}, [r0,:64], r2
996 vst1.64 {d6}, [r0,:64], r2
997 vst1.64 {d7}, [r0,:64]
998 bx lr
999endfunc
1000
1001function ff_vc1_inv_trans_8x8_dc_neon, export=1
1002 ldrsh r2, [r2] @ int dc = block[0];
1003
1004 vld1.64 {d0}, [r0,:64], r1
1005 vld1.64 {d1}, [r0,:64], r1
1006 vld1.64 {d4}, [r0,:64], r1
1007 vld1.64 {d5}, [r0,:64], r1
1008
1009 add r2, r2, r2, lsl #1 @ dc = (3 * dc + 1) >> 1;
1010 vld1.64 {d6}, [r0,:64], r1
1011 add r2, r2, #1
1012 vld1.64 {d7}, [r0,:64], r1
1013 vld1.64 {d16}, [r0,:64], r1
1014 vld1.64 {d17}, [r0,:64], r1
1015 asr r2, r2, #1
1016
1017 sub r0, r0, r1, lsl #3 @ restore r0 to original value
1018
1019 add r2, r2, r2, lsl #1 @ dc = (3 * dc + 16) >> 5;
1020 add r2, r2, #16
1021 asr r2, r2, #5
1022
1023 vdup.16 q1, r2 @ dc
1024
1025 vaddw.u8 q9, q1, d0
1026 vaddw.u8 q10, q1, d1
1027 vaddw.u8 q11, q1, d4
1028 vaddw.u8 q12, q1, d5
1029 vqmovun.s16 d0, q9
1030 vqmovun.s16 d1, q10
1031 vqmovun.s16 d4, q11
1032 vst1.64 {d0}, [r0,:64], r1
1033 vqmovun.s16 d5, q12
1034 vst1.64 {d1}, [r0,:64], r1
1035 vaddw.u8 q13, q1, d6
1036 vst1.64 {d4}, [r0,:64], r1
1037 vaddw.u8 q14, q1, d7
1038 vst1.64 {d5}, [r0,:64], r1
1039 vaddw.u8 q15, q1, d16
1040 vaddw.u8 q1, q1, d17 @ this destroys q1
1041 vqmovun.s16 d6, q13
1042 vqmovun.s16 d7, q14
1043 vqmovun.s16 d16, q15
1044 vqmovun.s16 d17, q1
1045 vst1.64 {d6}, [r0,:64], r1
1046 vst1.64 {d7}, [r0,:64], r1
1047 vst1.64 {d16}, [r0,:64], r1
1048 vst1.64 {d17}, [r0,:64]
1049 bx lr
1050endfunc
1051
1052function ff_vc1_inv_trans_8x4_dc_neon, export=1
1053 ldrsh r2, [r2] @ int dc = block[0];
1054
1055 vld1.64 {d0}, [r0,:64], r1
1056 vld1.64 {d1}, [r0,:64], r1
1057 vld1.64 {d4}, [r0,:64], r1
1058 vld1.64 {d5}, [r0,:64], r1
1059
1060 add r2, r2, r2, lsl #1 @ dc = ( 3 * dc + 1) >> 1;
1061
1062 sub r0, r0, r1, lsl #2 @ restore r0 to original value
1063
1064 add r2, r2, #1
1065 asr r2, r2, #1
1066
1067 add r2, r2, r2, lsl #4 @ dc = (17 * dc + 64) >> 7;
1068 add r2, r2, #64
1069 asr r2, r2, #7
1070
1071 vdup.16 q1, r2 @ dc
1072
1073 vaddw.u8 q3, q1, d0
1074 vaddw.u8 q8, q1, d1
1075 vaddw.u8 q9, q1, d4
1076 vaddw.u8 q10, q1, d5
1077 vqmovun.s16 d0, q3
1078 vqmovun.s16 d1, q8
1079 vqmovun.s16 d4, q9
1080 vst1.64 {d0}, [r0,:64], r1
1081 vqmovun.s16 d5, q10
1082 vst1.64 {d1}, [r0,:64], r1
1083 vst1.64 {d4}, [r0,:64], r1
1084 vst1.64 {d5}, [r0,:64]
1085 bx lr
1086endfunc
1087
1088function ff_vc1_inv_trans_4x8_dc_neon, export=1
1089 ldrsh r2, [r2] @ int dc = block[0];
1090
1091 vld1.32 {d0[]}, [r0,:32], r1
1092 vld1.32 {d1[]}, [r0,:32], r1
1093 vld1.32 {d0[1]}, [r0,:32], r1
1094 vld1.32 {d1[1]}, [r0,:32], r1
1095
1096 add r2, r2, r2, lsl #4 @ dc = (17 * dc + 4) >> 3;
1097 vld1.32 {d4[]}, [r0,:32], r1
1098 add r2, r2, #4
1099 vld1.32 {d5[]}, [r0,:32], r1
1100 vld1.32 {d4[1]}, [r0,:32], r1
1101 asr r2, r2, #3
1102 vld1.32 {d5[1]}, [r0,:32], r1
1103
1104 add r2, r2, r2, lsl #1 @ dc = (12 * dc + 64) >> 7;
1105
1106 sub r0, r0, r1, lsl #3 @ restore r0 to original value
1107
1108 lsl r2, r2, #2
1109 add r2, r2, #64
1110 asr r2, r2, #7
1111
1112 vdup.16 q1, r2 @ dc
1113
1114 vaddw.u8 q3, q1, d0
1115 vaddw.u8 q8, q1, d1
1116 vaddw.u8 q9, q1, d4
1117 vaddw.u8 q10, q1, d5
1118 vqmovun.s16 d0, q3
1119 vst1.32 {d0[0]}, [r0,:32], r1
1120 vqmovun.s16 d1, q8
1121 vst1.32 {d1[0]}, [r0,:32], r1
1122 vqmovun.s16 d4, q9
1123 vst1.32 {d0[1]}, [r0,:32], r1
1124 vqmovun.s16 d5, q10
1125 vst1.32 {d1[1]}, [r0,:32], r1
1126 vst1.32 {d4[0]}, [r0,:32], r1
1127 vst1.32 {d5[0]}, [r0,:32], r1
1128 vst1.32 {d4[1]}, [r0,:32], r1
1129 vst1.32 {d5[1]}, [r0,:32]
1130 bx lr
1131endfunc
1132
1133function ff_vc1_inv_trans_4x4_dc_neon, export=1
1134 ldrsh r2, [r2] @ int dc = block[0];
1135
1136 vld1.32 {d0[]}, [r0,:32], r1
1137 vld1.32 {d1[]}, [r0,:32], r1
1138 vld1.32 {d0[1]}, [r0,:32], r1
1139 vld1.32 {d1[1]}, [r0,:32], r1
1140
1141 add r2, r2, r2, lsl #4 @ dc = (17 * dc + 4) >> 3;
1142
1143 sub r0, r0, r1, lsl #2 @ restore r0 to original value
1144
1145 add r2, r2, #4
1146 asr r2, r2, #3
1147
1148 add r2, r2, r2, lsl #4 @ dc = (17 * dc + 64) >> 7;
1149 add r2, r2, #64
1150 asr r2, r2, #7
1151
1152 vdup.16 q1, r2 @ dc
1153
1154 vaddw.u8 q2, q1, d0
1155 vaddw.u8 q3, q1, d1
1156 vqmovun.s16 d0, q2
1157 vst1.32 {d0[0]}, [r0,:32], r1
1158 vqmovun.s16 d1, q3
1159 vst1.32 {d1[0]}, [r0,:32], r1
1160 vst1.32 {d0[1]}, [r0,:32], r1
1161 vst1.32 {d1[1]}, [r0,:32]
1162 bx lr
1163endfunc