1 /*
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "asm.S"
22
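@ In-register transpose helpers (vtrn/vswp building blocks).  The loop
@ filter and qpel code load whole rows but filter across columns, so these
@ macros convert blocks of byte or halfword lanes between row and column
@ order.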
23 .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
24 vtrn.32 \r0, \r4
25 vtrn.32 \r1, \r5
26 vtrn.32 \r2, \r6
27 vtrn.32 \r3, \r7
28 vtrn.16 \r0, \r2
29 vtrn.16 \r1, \r3
30 vtrn.16 \r4, \r6
31 vtrn.16 \r5, \r7
32 vtrn.8 \r0, \r1
33 vtrn.8 \r2, \r3
34 vtrn.8 \r4, \r5
35 vtrn.8 \r6, \r7
36 .endm
37
38 .macro transpose_4x4 r0 r1 r2 r3
39 vtrn.16 \r0, \r2
40 vtrn.16 \r1, \r3
41 vtrn.8 \r0, \r1
42 vtrn.8 \r2, \r3
43 .endm
44
45 .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
46 vswp \r0, \r4
47 vswp \r1, \r5
48 vswp \r2, \r6
49 vswp \r3, \r7
50 .endm
51
52 .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
53 vtrn.32 \r0, \r2
54 vtrn.32 \r1, \r3
55 vtrn.32 \r4, \r6
56 vtrn.32 \r5, \r7
57 vtrn.16 \r0, \r1
58 vtrn.16 \r2, \r3
59 vtrn.16 \r4, \r5
60 vtrn.16 \r6, \r7
61 .endm
62
63 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
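@ In outline this is the H.264 bilinear chroma interpolation; per output
@ pixel it is roughly
@
@   dst[i] = ( (8-x)*(8-y)*src[i]        + x*(8-y)*src[i+1]
@            + (8-x)*  y  *src[i+stride] + x*  y  *src[i+stride+1] + 32 ) >> 6
@
@ The muls/rsb/sub sequence below builds the four weights in r4, ip, r6 and
@ r7; when x*y is zero the code branches to cheaper one-dimensional (or
@ copy) loops at labels 2-5.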
64 .macro h264_chroma_mc8 type
65 function ff_\type\()_h264_chroma_mc8_neon, export=1
66 push {r4-r7, lr}
67 ldrd r4, [sp, #20]
68 .ifc \type,avg
69 mov lr, r0
70 .endif
71 pld [r1]
72 pld [r1, r2]
73
74 muls r7, r4, r5
75 rsb r6, r7, r5, lsl #3
76 rsb ip, r7, r4, lsl #3
77 sub r4, r7, r4, lsl #3
78 sub r4, r4, r5, lsl #3
79 add r4, r4, #64
80
81 beq 2f
82
83 add r5, r1, r2
84
85 vdup.8 d0, r4
86 lsl r4, r2, #1
87 vdup.8 d1, ip
88 vld1.64 {d4, d5}, [r1], r4
89 vdup.8 d2, r6
90 vld1.64 {d6, d7}, [r5], r4
91 vdup.8 d3, r7
92
93 vext.8 d5, d4, d5, #1
94 vext.8 d7, d6, d7, #1
95
96 1: pld [r5]
97 vmull.u8 q8, d4, d0
98 vmlal.u8 q8, d5, d1
99 vld1.64 {d4, d5}, [r1], r4
100 vmlal.u8 q8, d6, d2
101 vext.8 d5, d4, d5, #1
102 vmlal.u8 q8, d7, d3
103 vmull.u8 q9, d6, d0
104 subs r3, r3, #2
105 vmlal.u8 q9, d7, d1
106 vmlal.u8 q9, d4, d2
107 vmlal.u8 q9, d5, d3
108 vrshrn.u16 d16, q8, #6
109 vld1.64 {d6, d7}, [r5], r4
110 pld [r1]
111 vrshrn.u16 d17, q9, #6
112 .ifc \type,avg
113 vld1.64 {d20}, [lr,:64], r2
114 vld1.64 {d21}, [lr,:64], r2
115 vrhadd.u8 q8, q8, q10
116 .endif
117 vext.8 d7, d6, d7, #1
118 vst1.64 {d16}, [r0,:64], r2
119 vst1.64 {d17}, [r0,:64], r2
120 bgt 1b
121
122 pop {r4-r7, pc}
123
124 2: tst r6, r6
125 add ip, ip, r6
126 vdup.8 d0, r4
127 vdup.8 d1, ip
128
129 beq 4f
130
131 add r5, r1, r2
132 lsl r4, r2, #1
133 vld1.64 {d4}, [r1], r4
134 vld1.64 {d6}, [r5], r4
135
136 3: pld [r5]
137 vmull.u8 q8, d4, d0
138 vmlal.u8 q8, d6, d1
139 vld1.64 {d4}, [r1], r4
140 vmull.u8 q9, d6, d0
141 vmlal.u8 q9, d4, d1
142 vld1.64 {d6}, [r5], r4
143 vrshrn.u16 d16, q8, #6
144 vrshrn.u16 d17, q9, #6
145 .ifc \type,avg
146 vld1.64 {d20}, [lr,:64], r2
147 vld1.64 {d21}, [lr,:64], r2
148 vrhadd.u8 q8, q8, q10
149 .endif
150 subs r3, r3, #2
151 pld [r1]
152 vst1.64 {d16}, [r0,:64], r2
153 vst1.64 {d17}, [r0,:64], r2
154 bgt 3b
155
156 pop {r4-r7, pc}
157
158 4: vld1.64 {d4, d5}, [r1], r2
159 vld1.64 {d6, d7}, [r1], r2
160 vext.8 d5, d4, d5, #1
161 vext.8 d7, d6, d7, #1
162
163 5: pld [r1]
164 subs r3, r3, #2
165 vmull.u8 q8, d4, d0
166 vmlal.u8 q8, d5, d1
167 vld1.64 {d4, d5}, [r1], r2
168 vmull.u8 q9, d6, d0
169 vmlal.u8 q9, d7, d1
170 pld [r1]
171 vext.8 d5, d4, d5, #1
172 vrshrn.u16 d16, q8, #6
173 vrshrn.u16 d17, q9, #6
174 .ifc \type,avg
175 vld1.64 {d20}, [lr,:64], r2
176 vld1.64 {d21}, [lr,:64], r2
177 vrhadd.u8 q8, q8, q10
178 .endif
179 vld1.64 {d6, d7}, [r1], r2
180 vext.8 d7, d6, d7, #1
181 vst1.64 {d16}, [r0,:64], r2
182 vst1.64 {d17}, [r0,:64], r2
183 bgt 5b
184
185 pop {r4-r7, pc}
186 endfunc
187 .endm
188
189 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
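@ Same bilinear filter as chroma_mc8, but for 4-pixel wide blocks the two
@ horizontal taps of a row are packed into one doubleword (vext + vtrn.32),
@ so each vmull/vmlal applies both weights at once and vadd.i16 folds the
@ halves back together.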
190 .macro h264_chroma_mc4 type
191 function ff_\type\()_h264_chroma_mc4_neon, export=1
192 push {r4-r7, lr}
193 ldrd r4, [sp, #20]
194 .ifc \type,avg
195 mov lr, r0
196 .endif
197 pld [r1]
198 pld [r1, r2]
199
200 muls r7, r4, r5
201 rsb r6, r7, r5, lsl #3
202 rsb ip, r7, r4, lsl #3
203 sub r4, r7, r4, lsl #3
204 sub r4, r4, r5, lsl #3
205 add r4, r4, #64
206
207 beq 2f
208
209 add r5, r1, r2
210
211 vdup.8 d0, r4
212 lsl r4, r2, #1
213 vdup.8 d1, ip
214 vld1.64 {d4}, [r1], r4
215 vdup.8 d2, r6
216 vld1.64 {d6}, [r5], r4
217 vdup.8 d3, r7
218
219 vext.8 d5, d4, d5, #1
220 vext.8 d7, d6, d7, #1
221 vtrn.32 d4, d5
222 vtrn.32 d6, d7
223
224 vtrn.32 d0, d1
225 vtrn.32 d2, d3
226
227 1: pld [r5]
228 vmull.u8 q8, d4, d0
229 vmlal.u8 q8, d6, d2
230 vld1.64 {d4}, [r1], r4
231 vext.8 d5, d4, d5, #1
232 vtrn.32 d4, d5
233 vmull.u8 q9, d6, d0
234 vmlal.u8 q9, d4, d2
235 vld1.64 {d6}, [r5], r4
236 vadd.i16 d16, d16, d17
237 vadd.i16 d17, d18, d19
238 vrshrn.u16 d16, q8, #6
239 subs r3, r3, #2
240 pld [r1]
241 .ifc \type,avg
242 vld1.32 {d20[0]}, [lr,:32], r2
243 vld1.32 {d20[1]}, [lr,:32], r2
244 vrhadd.u8 d16, d16, d20
245 .endif
246 vext.8 d7, d6, d7, #1
247 vtrn.32 d6, d7
248 vst1.32 {d16[0]}, [r0,:32], r2
249 vst1.32 {d16[1]}, [r0,:32], r2
250 bgt 1b
251
252 pop {r4-r7, pc}
253
254 2: tst r6, r6
255 add ip, ip, r6
256 vdup.8 d0, r4
257 vdup.8 d1, ip
258 vtrn.32 d0, d1
259
260 beq 4f
261
262 vext.32 d1, d0, d1, #1
263 add r5, r1, r2
264 lsl r4, r2, #1
265 vld1.32 {d4[0]}, [r1], r4
266 vld1.32 {d4[1]}, [r5], r4
267
268 3: pld [r5]
269 vmull.u8 q8, d4, d0
270 vld1.32 {d4[0]}, [r1], r4
271 vmull.u8 q9, d4, d1
272 vld1.32 {d4[1]}, [r5], r4
273 vadd.i16 d16, d16, d17
274 vadd.i16 d17, d18, d19
275 vrshrn.u16 d16, q8, #6
276 .ifc \type,avg
277 vld1.32 {d20[0]}, [lr,:32], r2
278 vld1.32 {d20[1]}, [lr,:32], r2
279 vrhadd.u8 d16, d16, d20
280 .endif
281 subs r3, r3, #2
282 pld [r1]
283 vst1.32 {d16[0]}, [r0,:32], r2
284 vst1.32 {d16[1]}, [r0,:32], r2
285 bgt 3b
286
287 pop {r4-r7, pc}
288
289 4: vld1.64 {d4}, [r1], r2
290 vld1.64 {d6}, [r1], r2
291 vext.8 d5, d4, d5, #1
292 vext.8 d7, d6, d7, #1
293 vtrn.32 d4, d5
294 vtrn.32 d6, d7
295
296 5: vmull.u8 q8, d4, d0
297 vmull.u8 q9, d6, d0
298 subs r3, r3, #2
299 vld1.64 {d4}, [r1], r2
300 vext.8 d5, d4, d5, #1
301 vtrn.32 d4, d5
302 vadd.i16 d16, d16, d17
303 vadd.i16 d17, d18, d19
304 pld [r1]
305 vrshrn.u16 d16, q8, #6
306 .ifc \type,avg
307 vld1.32 {d20[0]}, [lr,:32], r2
308 vld1.32 {d20[1]}, [lr,:32], r2
309 vrhadd.u8 d16, d16, d20
310 .endif
311 vld1.64 {d6}, [r1], r2
312 vext.8 d7, d6, d7, #1
313 vtrn.32 d6, d7
314 pld [r1]
315 vst1.32 {d16[0]}, [r0,:32], r2
316 vst1.32 {d16[1]}, [r0,:32], r2
317 bgt 5b
318
319 pop {r4-r7, pc}
320 endfunc
321 .endm
322
323 .macro h264_chroma_mc2 type
324 function ff_\type\()_h264_chroma_mc2_neon, export=1
325 push {r4-r6, lr}
326 ldr r4, [sp, #16]
327 ldr lr, [sp, #20]
328 pld [r1]
329 pld [r1, r2]
330 orrs r5, r4, lr
331 beq 2f
332
333 mul r5, r4, lr
334 rsb r6, r5, lr, lsl #3
335 rsb r12, r5, r4, lsl #3
336 sub r4, r5, r4, lsl #3
337 sub r4, r4, lr, lsl #3
338 add r4, r4, #64
339 vdup.8 d0, r4
340 vdup.8 d2, r12
341 vdup.8 d1, r6
342 vdup.8 d3, r5
343 vtrn.16 q0, q1
344 1:
345 vld1.32 {d4[0]}, [r1], r2
346 vld1.32 {d4[1]}, [r1], r2
347 vrev64.32 d5, d4
348 vld1.32 {d5[1]}, [r1]
349 vext.8 q3, q2, q2, #1
350 vtrn.16 q2, q3
351 vmull.u8 q8, d4, d0
352 vmlal.u8 q8, d5, d1
353 .ifc \type,avg
354 vld1.16 {d18[0]}, [r0,:16], r2
355 vld1.16 {d18[1]}, [r0,:16]
356 sub r0, r0, r2
357 .endif
358 vtrn.32 d16, d17
359 vadd.i16 d16, d16, d17
360 vrshrn.u16 d16, q8, #6
361 .ifc \type,avg
362 vrhadd.u8 d16, d16, d18
363 .endif
364 vst1.16 {d16[0]}, [r0,:16], r2
365 vst1.16 {d16[1]}, [r0,:16], r2
366 subs r3, r3, #2
367 bgt 1b
368 pop {r4-r6, pc}
369 2:
370 .ifc \type,put
371 ldrh r5, [r1], r2
372 strh r5, [r0], r2
373 ldrh r6, [r1], r2
374 strh r6, [r0], r2
375 .else
376 vld1.16 {d16[0]}, [r1], r2
377 vld1.16 {d16[1]}, [r1], r2
378 vld1.16 {d18[0]}, [r0,:16], r2
379 vld1.16 {d18[1]}, [r0,:16]
380 sub r0, r0, r2
381 vrhadd.u8 d16, d16, d18
382 vst1.16 {d16[0]}, [r0,:16], r2
383 vst1.16 {d16[1]}, [r0,:16], r2
384 .endif
385 subs r3, r3, #2
386 bgt 2b
387 pop {r4-r6, pc}
388 endfunc
389 .endm
390
391 .text
392 .align
393
394 h264_chroma_mc8 put
395 h264_chroma_mc8 avg
396 h264_chroma_mc4 put
397 h264_chroma_mc4 avg
398 h264_chroma_mc2 put
399 h264_chroma_mc2 avg
400
401 /* H.264 loop filter */
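@ An edge is only filtered where
@     |p0 - q0| < alpha  &&  |p1 - p0| < beta  &&  |q1 - q0| < beta
@ with the correction clipped per 4-pixel group to the tc0 value from the
@ table passed on the stack.  h264_loop_filter_start loads the four tc0
@ bytes into d24[0] and returns early when alpha or beta is zero, or when
@ all four tc0 values are negative, i.e. when nothing would be filtered.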
402
403 .macro h264_loop_filter_start
404 ldr ip, [sp]
405 tst r2, r2
406 ldr ip, [ip]
407 tstne r3, r3
408 vmov.32 d24[0], ip
409 and ip, ip, ip, lsl #16
410 bxeq lr
411 ands ip, ip, ip, lsl #8
412 bxlt lr
413 .endm
414
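@ d8-d15 are callee-saved, but the luma filter uses q4-q7 as scratch, so
@ these macros spill them to a 16-byte aligned block carved out of the
@ stack and restore them afterwards.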
415 .macro align_push_regs
416 and ip, sp, #15
417 add ip, ip, #32
418 sub sp, sp, ip
419 vst1.64 {d12-d15}, [sp,:128]
420 sub sp, sp, #32
421 vst1.64 {d8-d11}, [sp,:128]
422 .endm
423
424 .macro align_pop_regs
425 vld1.64 {d8-d11}, [sp,:128]!
426 vld1.64 {d12-d15}, [sp,:128], ip
427 .endm
428
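@ Filters one 16-pixel edge.  On entry q10/q9/q8 hold p2/p1/p0 and q0/q1/q2
@ hold the q-side pixels q0/q1/q2, with the tc0 bytes in d24.  On exit q8
@ and q0 contain the filtered p0/q0, and q4/q5 the filtered p1/q1 (taken
@ only where |p2 - p0| resp. |q2 - q0| is below beta).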
429 .macro h264_loop_filter_luma
430 vdup.8 q11, r2 @ alpha
431 vmovl.u8 q12, d24
432 vabd.u8 q6, q8, q0 @ abs(p0 - q0)
433 vmovl.u16 q12, d24
434 vabd.u8 q14, q9, q8 @ abs(p1 - p0)
435 vsli.16 q12, q12, #8
436 vabd.u8 q15, q1, q0 @ abs(q1 - q0)
437 vsli.32 q12, q12, #16
438 vclt.u8 q6, q6, q11 @ < alpha
439 vdup.8 q11, r3 @ beta
440 vclt.s8 q7, q12, #0
441 vclt.u8 q14, q14, q11 @ < beta
442 vclt.u8 q15, q15, q11 @ < beta
443 vbic q6, q6, q7
444 vabd.u8 q4, q10, q8 @ abs(p2 - p0)
445 vand q6, q6, q14
446 vabd.u8 q5, q2, q0 @ abs(q2 - q0)
447 vclt.u8 q4, q4, q11 @ < beta
448 vand q6, q6, q15
449 vclt.u8 q5, q5, q11 @ < beta
450 vand q4, q4, q6
451 vand q5, q5, q6
452 vand q12, q12, q6
453 vrhadd.u8 q14, q8, q0
454 vsub.i8 q6, q12, q4
455 vqadd.u8 q7, q9, q12
456 vhadd.u8 q10, q10, q14
457 vsub.i8 q6, q6, q5
458 vhadd.u8 q14, q2, q14
459 vmin.u8 q7, q7, q10
460 vqsub.u8 q11, q9, q12
461 vqadd.u8 q2, q1, q12
462 vmax.u8 q7, q7, q11
463 vqsub.u8 q11, q1, q12
464 vmin.u8 q14, q2, q14
465 vmovl.u8 q2, d0
466 vmax.u8 q14, q14, q11
467 vmovl.u8 q10, d1
468 vsubw.u8 q2, q2, d16
469 vsubw.u8 q10, q10, d17
470 vshl.i16 q2, q2, #2
471 vshl.i16 q10, q10, #2
472 vaddw.u8 q2, q2, d18
473 vaddw.u8 q10, q10, d19
474 vsubw.u8 q2, q2, d2
475 vsubw.u8 q10, q10, d3
476 vrshrn.i16 d4, q2, #3
477 vrshrn.i16 d5, q10, #3
478 vbsl q4, q7, q9
479 vbsl q5, q14, q1
480 vneg.s8 q7, q6
481 vmovl.u8 q14, d16
482 vmin.s8 q2, q2, q6
483 vmovl.u8 q6, d17
484 vmax.s8 q2, q2, q7
485 vmovl.u8 q11, d0
486 vmovl.u8 q12, d1
487 vaddw.s8 q14, q14, d4
488 vaddw.s8 q6, q6, d5
489 vsubw.s8 q11, q11, d4
490 vsubw.s8 q12, q12, d5
491 vqmovun.s16 d16, q14
492 vqmovun.s16 d17, q6
493 vqmovun.s16 d0, q11
494 vqmovun.s16 d1, q12
495 .endm
496
497 function ff_h264_v_loop_filter_luma_neon, export=1
498 h264_loop_filter_start
499
500 vld1.64 {d0, d1}, [r0,:128], r1
501 vld1.64 {d2, d3}, [r0,:128], r1
502 vld1.64 {d4, d5}, [r0,:128], r1
503 sub r0, r0, r1, lsl #2
504 sub r0, r0, r1, lsl #1
505 vld1.64 {d20,d21}, [r0,:128], r1
506 vld1.64 {d18,d19}, [r0,:128], r1
507 vld1.64 {d16,d17}, [r0,:128], r1
508
509 align_push_regs
510
511 h264_loop_filter_luma
512
513 sub r0, r0, r1, lsl #1
514 vst1.64 {d8, d9}, [r0,:128], r1
515 vst1.64 {d16,d17}, [r0,:128], r1
516 vst1.64 {d0, d1}, [r0,:128], r1
517 vst1.64 {d10,d11}, [r0,:128]
518
519 align_pop_regs
520 bx lr
521 endfunc
522
523 function ff_h264_h_loop_filter_luma_neon, export=1
524 h264_loop_filter_start
525
526 sub r0, r0, #4
527 vld1.64 {d6}, [r0], r1
528 vld1.64 {d20}, [r0], r1
529 vld1.64 {d18}, [r0], r1
530 vld1.64 {d16}, [r0], r1
531 vld1.64 {d0}, [r0], r1
532 vld1.64 {d2}, [r0], r1
533 vld1.64 {d4}, [r0], r1
534 vld1.64 {d26}, [r0], r1
535 vld1.64 {d7}, [r0], r1
536 vld1.64 {d21}, [r0], r1
537 vld1.64 {d19}, [r0], r1
538 vld1.64 {d17}, [r0], r1
539 vld1.64 {d1}, [r0], r1
540 vld1.64 {d3}, [r0], r1
541 vld1.64 {d5}, [r0], r1
542 vld1.64 {d27}, [r0], r1
543
544 transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
545
546 align_push_regs
547
548 h264_loop_filter_luma
549
550 transpose_4x4 q4, q8, q0, q5
551
552 sub r0, r0, r1, lsl #4
553 add r0, r0, #2
554 vst1.32 {d8[0]}, [r0], r1
555 vst1.32 {d16[0]}, [r0], r1
556 vst1.32 {d0[0]}, [r0], r1
557 vst1.32 {d10[0]}, [r0], r1
558 vst1.32 {d8[1]}, [r0], r1
559 vst1.32 {d16[1]}, [r0], r1
560 vst1.32 {d0[1]}, [r0], r1
561 vst1.32 {d10[1]}, [r0], r1
562 vst1.32 {d9[0]}, [r0], r1
563 vst1.32 {d17[0]}, [r0], r1
564 vst1.32 {d1[0]}, [r0], r1
565 vst1.32 {d11[0]}, [r0], r1
566 vst1.32 {d9[1]}, [r0], r1
567 vst1.32 {d17[1]}, [r0], r1
568 vst1.32 {d1[1]}, [r0], r1
569 vst1.32 {d11[1]}, [r0], r1
570
571 align_pop_regs
572 bx lr
573 endfunc
574
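@ Chroma variant: only p0 and q0 are updated; p1/q1 enter the threshold
@ tests but are left untouched.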
575 .macro h264_loop_filter_chroma
576 vdup.8 d22, r2 @ alpha
577 vmovl.u8 q12, d24
578 vabd.u8 d26, d16, d0 @ abs(p0 - q0)
579 vmovl.u8 q2, d0
580 vabd.u8 d28, d18, d16 @ abs(p1 - p0)
581 vsubw.u8 q2, q2, d16
582 vsli.16 d24, d24, #8
583 vshl.i16 q2, q2, #2
584 vabd.u8 d30, d2, d0 @ abs(q1 - q0)
585 vaddw.u8 q2, q2, d18
586 vclt.u8 d26, d26, d22 @ < alpha
587 vsubw.u8 q2, q2, d2
588 vdup.8 d22, r3 @ beta
589 vclt.s8 d25, d24, #0
590 vrshrn.i16 d4, q2, #3
591 vclt.u8 d28, d28, d22 @ < beta
592 vbic d26, d26, d25
593 vclt.u8 d30, d30, d22 @ < beta
594 vand d26, d26, d28
595 vneg.s8 d25, d24
596 vand d26, d26, d30
597 vmin.s8 d4, d4, d24
598 vmovl.u8 q14, d16
599 vand d4, d4, d26
600 vmax.s8 d4, d4, d25
601 vmovl.u8 q11, d0
602 vaddw.s8 q14, q14, d4
603 vsubw.s8 q11, q11, d4
604 vqmovun.s16 d16, q14
605 vqmovun.s16 d0, q11
606 .endm
607
608 function ff_h264_v_loop_filter_chroma_neon, export=1
609 h264_loop_filter_start
610
611 sub r0, r0, r1, lsl #1
612 vld1.64 {d18}, [r0,:64], r1
613 vld1.64 {d16}, [r0,:64], r1
614 vld1.64 {d0}, [r0,:64], r1
615 vld1.64 {d2}, [r0,:64]
616
617 h264_loop_filter_chroma
618
619 sub r0, r0, r1, lsl #1
620 vst1.64 {d16}, [r0,:64], r1
621 vst1.64 {d0}, [r0,:64], r1
622
623 bx lr
624 endfunc
625
626 function ff_h264_h_loop_filter_chroma_neon, export=1
627 h264_loop_filter_start
628
629 sub r0, r0, #2
630 vld1.32 {d18[0]}, [r0], r1
631 vld1.32 {d16[0]}, [r0], r1
632 vld1.32 {d0[0]}, [r0], r1
633 vld1.32 {d2[0]}, [r0], r1
634 vld1.32 {d18[1]}, [r0], r1
635 vld1.32 {d16[1]}, [r0], r1
636 vld1.32 {d0[1]}, [r0], r1
637 vld1.32 {d2[1]}, [r0], r1
638
639 vtrn.16 d18, d0
640 vtrn.16 d16, d2
641 vtrn.8 d18, d16
642 vtrn.8 d0, d2
643
644 h264_loop_filter_chroma
645
646 vtrn.16 d18, d0
647 vtrn.16 d16, d2
648 vtrn.8 d18, d16
649 vtrn.8 d0, d2
650
651 sub r0, r0, r1, lsl #3
652 vst1.32 {d18[0]}, [r0], r1
653 vst1.32 {d16[0]}, [r0], r1
654 vst1.32 {d0[0]}, [r0], r1
655 vst1.32 {d2[0]}, [r0], r1
656 vst1.32 {d18[1]}, [r0], r1
657 vst1.32 {d16[1]}, [r0], r1
658 vst1.32 {d0[1]}, [r0], r1
659 vst1.32 {d2[1]}, [r0], r1
660
661 bx lr
662 endfunc
663
664 /* H.264 qpel MC */
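@ Half-pel positions use the standard 6-tap filter (1, -5, 20, 20, -5, 1)
@ with rounding, i.e. roughly (sum + 16) >> 5 per pixel.  lowpass_const
@ packs the constants into d6 (d6[0] = 5, d6[1] = 20) so the macros can use
@ vmla/vmls with scalar lanes; lowpass_8 filters two rows of 8 pixels,
@ lowpass_8_1 a single row, and lowpass_8.16 runs the same filter over the
@ 16-bit intermediates of the 2-D case, narrowing with (sum + 512) >> 10.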
665
666 .macro lowpass_const r
667 movw \r, #5
668 movt \r, #20
669 vmov.32 d6[0], \r
670 .endm
671
672 .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
673 .if \narrow
674 t0 .req q0
675 t1 .req q8
676 .else
677 t0 .req \d0
678 t1 .req \d1
679 .endif
680 vext.8 d2, \r0, \r1, #2
681 vext.8 d3, \r0, \r1, #3
682 vaddl.u8 q1, d2, d3
683 vext.8 d4, \r0, \r1, #1
684 vext.8 d5, \r0, \r1, #4
685 vaddl.u8 q2, d4, d5
686 vext.8 d30, \r0, \r1, #5
687 vaddl.u8 t0, \r0, d30
688 vext.8 d18, \r2, \r3, #2
689 vmla.i16 t0, q1, d6[1]
690 vext.8 d19, \r2, \r3, #3
691 vaddl.u8 q9, d18, d19
692 vext.8 d20, \r2, \r3, #1
693 vmls.i16 t0, q2, d6[0]
694 vext.8 d21, \r2, \r3, #4
695 vaddl.u8 q10, d20, d21
696 vext.8 d31, \r2, \r3, #5
697 vaddl.u8 t1, \r2, d31
698 vmla.i16 t1, q9, d6[1]
699 vmls.i16 t1, q10, d6[0]
700 .if \narrow
701 vqrshrun.s16 \d0, t0, #5
702 vqrshrun.s16 \d1, t1, #5
703 .endif
704 .unreq t0
705 .unreq t1
706 .endm
707
708 .macro lowpass_8_1 r0, r1, d0, narrow=1
709 .if \narrow
710 t0 .req q0
711 .else
712 t0 .req \d0
713 .endif
714 vext.8 d2, \r0, \r1, #2
715 vext.8 d3, \r0, \r1, #3
716 vaddl.u8 q1, d2, d3
717 vext.8 d4, \r0, \r1, #1
718 vext.8 d5, \r0, \r1, #4
719 vaddl.u8 q2, d4, d5
720 vext.8 d30, \r0, \r1, #5
721 vaddl.u8 t0, \r0, d30
722 vmla.i16 t0, q1, d6[1]
723 vmls.i16 t0, q2, d6[0]
724 .if \narrow
725 vqrshrun.s16 \d0, t0, #5
726 .endif
727 .unreq t0
728 .endm
729
730 .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
731 vext.16 q1, \r0, \r1, #2
732 vext.16 q0, \r0, \r1, #3
733 vaddl.s16 q9, d2, d0
734 vext.16 q2, \r0, \r1, #1
735 vaddl.s16 q1, d3, d1
736 vext.16 q3, \r0, \r1, #4
737 vaddl.s16 q10, d4, d6
738 vext.16 \r1, \r0, \r1, #5
739 vaddl.s16 q2, d5, d7
740 vaddl.s16 q0, \h0, \h1
741 vaddl.s16 q8, \l0, \l1
742
743 vshl.i32 q3, q9, #4
744 vshl.i32 q9, q9, #2
745 vshl.i32 q15, q10, #2
746 vadd.i32 q9, q9, q3
747 vadd.i32 q10, q10, q15
748
749 vshl.i32 q3, q1, #4
750 vshl.i32 q1, q1, #2
751 vshl.i32 q15, q2, #2
752 vadd.i32 q1, q1, q3
753 vadd.i32 q2, q2, q15
754
755 vadd.i32 q9, q9, q8
756 vsub.i32 q9, q9, q10
757
758 vadd.i32 q1, q1, q0
759 vsub.i32 q1, q1, q2
760
761 vrshrn.s32 d18, q9, #10
762 vrshrn.s32 d19, q1, #10
763
764 vqmovun.s16 \d, q9
765 .endm
766
767 function put_h264_qpel16_h_lowpass_neon_packed
768 mov r4, lr
769 mov ip, #16
770 mov r3, #8
771 bl put_h264_qpel8_h_lowpass_neon
772 sub r1, r1, r2, lsl #4
773 add r1, r1, #8
774 mov ip, #16
775 mov lr, r4
776 b put_h264_qpel8_h_lowpass_neon
777 endfunc
778
779 .macro h264_qpel_h_lowpass type
780 function \type\()_h264_qpel16_h_lowpass_neon
781 push {lr}
782 mov ip, #16
783 bl \type\()_h264_qpel8_h_lowpass_neon
784 sub r0, r0, r3, lsl #4
785 sub r1, r1, r2, lsl #4
786 add r0, r0, #8
787 add r1, r1, #8
788 mov ip, #16
789 pop {lr}
790 endfunc
791
792 function \type\()_h264_qpel8_h_lowpass_neon
793 1: vld1.64 {d0, d1}, [r1], r2
794 vld1.64 {d16,d17}, [r1], r2
795 subs ip, ip, #2
796 lowpass_8 d0, d1, d16, d17, d0, d16
797 .ifc \type,avg
798 vld1.8 {d2}, [r0,:64], r3
799 vrhadd.u8 d0, d0, d2
800 vld1.8 {d3}, [r0,:64]
801 vrhadd.u8 d16, d16, d3
802 sub r0, r0, r3
803 .endif
804 vst1.64 {d0}, [r0,:64], r3
805 vst1.64 {d16}, [r0,:64], r3
806 bne 1b
807 bx lr
808 endfunc
809 .endm
810
811 h264_qpel_h_lowpass put
812 h264_qpel_h_lowpass avg
813
814 .macro h264_qpel_h_lowpass_l2 type
815 function \type\()_h264_qpel16_h_lowpass_l2_neon
816 push {lr}
817 mov ip, #16
818 bl \type\()_h264_qpel8_h_lowpass_l2_neon
819 sub r0, r0, r2, lsl #4
820 sub r1, r1, r2, lsl #4
821 sub r3, r3, r2, lsl #4
822 add r0, r0, #8
823 add r1, r1, #8
824 add r3, r3, #8
825 mov ip, #16
826 pop {lr}
827 endfunc
828
829 function \type\()_h264_qpel8_h_lowpass_l2_neon
830 1: vld1.64 {d0, d1}, [r1], r2
831 vld1.64 {d16,d17}, [r1], r2
832 vld1.64 {d28}, [r3], r2
833 vld1.64 {d29}, [r3], r2
834 subs ip, ip, #2
835 lowpass_8 d0, d1, d16, d17, d0, d1
836 vrhadd.u8 q0, q0, q14
837 .ifc \type,avg
838 vld1.8 {d2}, [r0,:64], r2
839 vrhadd.u8 d0, d0, d2
840 vld1.8 {d3}, [r0,:64]
841 vrhadd.u8 d1, d1, d3
842 sub r0, r0, r2
843 .endif
844 vst1.64 {d0}, [r0,:64], r2
845 vst1.64 {d1}, [r0,:64], r2
846 bne 1b
847 bx lr
848 endfunc
849 .endm
850
851 h264_qpel_h_lowpass_l2 put
852 h264_qpel_h_lowpass_l2 avg
853
854 function put_h264_qpel16_v_lowpass_neon_packed
855 mov r4, lr
856 mov r2, #8
857 bl put_h264_qpel8_v_lowpass_neon
858 sub r1, r1, r3, lsl #2
859 bl put_h264_qpel8_v_lowpass_neon
860 sub r1, r1, r3, lsl #4
861 sub r1, r1, r3, lsl #2
862 add r1, r1, #8
863 bl put_h264_qpel8_v_lowpass_neon
864 sub r1, r1, r3, lsl #2
865 mov lr, r4
866 b put_h264_qpel8_v_lowpass_neon
867 endfunc
868
869 .macro h264_qpel_v_lowpass type
870 function \type\()_h264_qpel16_v_lowpass_neon
871 mov r4, lr
872 bl \type\()_h264_qpel8_v_lowpass_neon
873 sub r1, r1, r3, lsl #2
874 bl \type\()_h264_qpel8_v_lowpass_neon
875 sub r0, r0, r2, lsl #4
876 add r0, r0, #8
877 sub r1, r1, r3, lsl #4
878 sub r1, r1, r3, lsl #2
879 add r1, r1, #8
880 bl \type\()_h264_qpel8_v_lowpass_neon
881 sub r1, r1, r3, lsl #2
882 mov lr, r4
883 endfunc
884
885 function \type\()_h264_qpel8_v_lowpass_neon
886 vld1.64 {d8}, [r1], r3
887 vld1.64 {d10}, [r1], r3
888 vld1.64 {d12}, [r1], r3
889 vld1.64 {d14}, [r1], r3
890 vld1.64 {d22}, [r1], r3
891 vld1.64 {d24}, [r1], r3
892 vld1.64 {d26}, [r1], r3
893 vld1.64 {d28}, [r1], r3
894 vld1.64 {d9}, [r1], r3
895 vld1.64 {d11}, [r1], r3
896 vld1.64 {d13}, [r1], r3
897 vld1.64 {d15}, [r1], r3
898 vld1.64 {d23}, [r1]
899
900 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
901 lowpass_8 d8, d9, d10, d11, d8, d10
902 lowpass_8 d12, d13, d14, d15, d12, d14
903 lowpass_8 d22, d23, d24, d25, d22, d24
904 lowpass_8 d26, d27, d28, d29, d26, d28
905 transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
906
907 .ifc \type,avg
908 vld1.8 {d9}, [r0,:64], r2
909 vrhadd.u8 d8, d8, d9
910 vld1.8 {d11}, [r0,:64], r2
911 vrhadd.u8 d10, d10, d11
912 vld1.8 {d13}, [r0,:64], r2
913 vrhadd.u8 d12, d12, d13
914 vld1.8 {d15}, [r0,:64], r2
915 vrhadd.u8 d14, d14, d15
916 vld1.8 {d23}, [r0,:64], r2
917 vrhadd.u8 d22, d22, d23
918 vld1.8 {d25}, [r0,:64], r2
919 vrhadd.u8 d24, d24, d25
920 vld1.8 {d27}, [r0,:64], r2
921 vrhadd.u8 d26, d26, d27
922 vld1.8 {d29}, [r0,:64], r2
923 vrhadd.u8 d28, d28, d29
924 sub r0, r0, r2, lsl #3
925 .endif
926
927 vst1.64 {d8}, [r0,:64], r2
928 vst1.64 {d10}, [r0,:64], r2
929 vst1.64 {d12}, [r0,:64], r2
930 vst1.64 {d14}, [r0,:64], r2
931 vst1.64 {d22}, [r0,:64], r2
932 vst1.64 {d24}, [r0,:64], r2
933 vst1.64 {d26}, [r0,:64], r2
934 vst1.64 {d28}, [r0,:64], r2
935
936 bx lr
937 endfunc
938 .endm
939
940 h264_qpel_v_lowpass put
941 h264_qpel_v_lowpass avg
942
943 .macro h264_qpel_v_lowpass_l2 type
944 function \type\()_h264_qpel16_v_lowpass_l2_neon
945 mov r4, lr
946 bl \type\()_h264_qpel8_v_lowpass_l2_neon
947 sub r1, r1, r3, lsl #2
948 bl \type\()_h264_qpel8_v_lowpass_l2_neon
949 sub r0, r0, r3, lsl #4
950 sub ip, ip, r2, lsl #4
951 add r0, r0, #8
952 add ip, ip, #8
953 sub r1, r1, r3, lsl #4
954 sub r1, r1, r3, lsl #2
955 add r1, r1, #8
956 bl \type\()_h264_qpel8_v_lowpass_l2_neon
957 sub r1, r1, r3, lsl #2
958 mov lr, r4
959 endfunc
960
961 function \type\()_h264_qpel8_v_lowpass_l2_neon
962 vld1.64 {d8}, [r1], r3
963 vld1.64 {d10}, [r1], r3
964 vld1.64 {d12}, [r1], r3
965 vld1.64 {d14}, [r1], r3
966 vld1.64 {d22}, [r1], r3
967 vld1.64 {d24}, [r1], r3
968 vld1.64 {d26}, [r1], r3
969 vld1.64 {d28}, [r1], r3
970 vld1.64 {d9}, [r1], r3
971 vld1.64 {d11}, [r1], r3
972 vld1.64 {d13}, [r1], r3
973 vld1.64 {d15}, [r1], r3
974 vld1.64 {d23}, [r1]
975
976 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
977 lowpass_8 d8, d9, d10, d11, d8, d9
978 lowpass_8 d12, d13, d14, d15, d12, d13
979 lowpass_8 d22, d23, d24, d25, d22, d23
980 lowpass_8 d26, d27, d28, d29, d26, d27
981 transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
982
983 vld1.64 {d0}, [ip], r2
984 vld1.64 {d1}, [ip], r2
985 vld1.64 {d2}, [ip], r2
986 vld1.64 {d3}, [ip], r2
987 vld1.64 {d4}, [ip], r2
988 vrhadd.u8 q0, q0, q4
989 vld1.64 {d5}, [ip], r2
990 vrhadd.u8 q1, q1, q6
991 vld1.64 {d10}, [ip], r2
992 vrhadd.u8 q2, q2, q11
993 vld1.64 {d11}, [ip], r2
994 vrhadd.u8 q5, q5, q13
995
996 .ifc \type,avg
997 vld1.8 {d16}, [r0,:64], r3
998 vrhadd.u8 d0, d0, d16
999 vld1.8 {d17}, [r0,:64], r3
1000 vrhadd.u8 d1, d1, d17
1001 vld1.8 {d16}, [r0,:64], r3
1002 vrhadd.u8 d2, d2, d16
1003 vld1.8 {d17}, [r0,:64], r3
1004 vrhadd.u8 d3, d3, d17
1005 vld1.8 {d16}, [r0,:64], r3
1006 vrhadd.u8 d4, d4, d16
1007 vld1.8 {d17}, [r0,:64], r3
1008 vrhadd.u8 d5, d5, d17
1009 vld1.8 {d16}, [r0,:64], r3
1010 vrhadd.u8 d10, d10, d16
1011 vld1.8 {d17}, [r0,:64], r3
1012 vrhadd.u8 d11, d11, d17
1013 sub r0, r0, r3, lsl #3
1014 .endif
1015
1016 vst1.64 {d0}, [r0,:64], r3
1017 vst1.64 {d1}, [r0,:64], r3
1018 vst1.64 {d2}, [r0,:64], r3
1019 vst1.64 {d3}, [r0,:64], r3
1020 vst1.64 {d4}, [r0,:64], r3
1021 vst1.64 {d5}, [r0,:64], r3
1022 vst1.64 {d10}, [r0,:64], r3
1023 vst1.64 {d11}, [r0,:64], r3
1024
1025 bx lr
1026 endfunc
1027 .endm
1028
1029 h264_qpel_v_lowpass_l2 put
1030 h264_qpel_v_lowpass_l2 avg
1031
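@ 2-D (centre) half-pel helper: run the horizontal 6-tap filter over 13
@ input rows at full 16-bit precision into the scratch buffer at r4,
@ transpose, then run the vertical pass with lowpass_8.16.  The final 8x8
@ block is left in d8-d15 for the wrappers below to average and/or store.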
1032 function put_h264_qpel8_hv_lowpass_neon_top
1033 lowpass_const ip
1034 mov ip, #12
1035 1: vld1.64 {d0, d1}, [r1], r3
1036 vld1.64 {d16,d17}, [r1], r3
1037 subs ip, ip, #2
1038 lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
1039 vst1.64 {d22-d25}, [r4,:128]!
1040 bne 1b
1041
1042 vld1.64 {d0, d1}, [r1]
1043 lowpass_8_1 d0, d1, q12, narrow=0
1044
1045 mov ip, #-16
1046 add r4, r4, ip
1047 vld1.64 {d30,d31}, [r4,:128], ip
1048 vld1.64 {d20,d21}, [r4,:128], ip
1049 vld1.64 {d18,d19}, [r4,:128], ip
1050 vld1.64 {d16,d17}, [r4,:128], ip
1051 vld1.64 {d14,d15}, [r4,:128], ip
1052 vld1.64 {d12,d13}, [r4,:128], ip
1053 vld1.64 {d10,d11}, [r4,:128], ip
1054 vld1.64 {d8, d9}, [r4,:128], ip
1055 vld1.64 {d6, d7}, [r4,:128], ip
1056 vld1.64 {d4, d5}, [r4,:128], ip
1057 vld1.64 {d2, d3}, [r4,:128], ip
1058 vld1.64 {d0, d1}, [r4,:128]
1059
1060 swap4 d1, d3, d5, d7, d8, d10, d12, d14
1061 transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
1062
1063 swap4 d17, d19, d21, d31, d24, d26, d28, d22
1064 transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
1065
1066 vst1.64 {d30,d31}, [r4,:128]!
1067 vst1.64 {d6, d7}, [r4,:128]!
1068 vst1.64 {d20,d21}, [r4,:128]!
1069 vst1.64 {d4, d5}, [r4,:128]!
1070 vst1.64 {d18,d19}, [r4,:128]!
1071 vst1.64 {d2, d3}, [r4,:128]!
1072 vst1.64 {d16,d17}, [r4,:128]!
1073 vst1.64 {d0, d1}, [r4,:128]
1074
1075 lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
1076 lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
1077 lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
1078 lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
1079
1080 vld1.64 {d16,d17}, [r4,:128], ip
1081 vld1.64 {d30,d31}, [r4,:128], ip
1082 lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
1083 vld1.64 {d16,d17}, [r4,:128], ip
1084 vld1.64 {d30,d31}, [r4,:128], ip
1085 lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
1086 vld1.64 {d16,d17}, [r4,:128], ip
1087 vld1.64 {d30,d31}, [r4,:128], ip
1088 lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
1089 vld1.64 {d16,d17}, [r4,:128], ip
1090 vld1.64 {d30,d31}, [r4,:128]
1091 lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
1092
1093 transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
1094
1095 bx lr
1096 endfunc
1097
1098 .macro h264_qpel8_hv_lowpass type
1099 function \type\()_h264_qpel8_hv_lowpass_neon
1100 mov r10, lr
1101 bl put_h264_qpel8_hv_lowpass_neon_top
1102 .ifc \type,avg
1103 vld1.8 {d0}, [r0,:64], r2
1104 vrhadd.u8 d12, d12, d0
1105 vld1.8 {d1}, [r0,:64], r2
1106 vrhadd.u8 d13, d13, d1
1107 vld1.8 {d2}, [r0,:64], r2
1108 vrhadd.u8 d14, d14, d2
1109 vld1.8 {d3}, [r0,:64], r2
1110 vrhadd.u8 d15, d15, d3
1111 vld1.8 {d4}, [r0,:64], r2
1112 vrhadd.u8 d8, d8, d4
1113 vld1.8 {d5}, [r0,:64], r2
1114 vrhadd.u8 d9, d9, d5
1115 vld1.8 {d6}, [r0,:64], r2
1116 vrhadd.u8 d10, d10, d6
1117 vld1.8 {d7}, [r0,:64], r2
1118 vrhadd.u8 d11, d11, d7
1119 sub r0, r0, r2, lsl #3
1120 .endif
1121 vst1.64 {d12}, [r0,:64], r2
1122 vst1.64 {d13}, [r0,:64], r2
1123 vst1.64 {d14}, [r0,:64], r2
1124 vst1.64 {d15}, [r0,:64], r2
1125 vst1.64 {d8}, [r0,:64], r2
1126 vst1.64 {d9}, [r0,:64], r2
1127 vst1.64 {d10}, [r0,:64], r2
1128 vst1.64 {d11}, [r0,:64], r2
1129
1130 mov lr, r10
1131 bx lr
1132 endfunc
1133 .endm
1134
1135 h264_qpel8_hv_lowpass put
1136 h264_qpel8_hv_lowpass avg
1137
1138 .macro h264_qpel8_hv_lowpass_l2 type
1139 function \type\()_h264_qpel8_hv_lowpass_l2_neon
1140 mov r10, lr
1141 bl put_h264_qpel8_hv_lowpass_neon_top
1142
1143 vld1.64 {d0, d1}, [r2,:128]!
1144 vld1.64 {d2, d3}, [r2,:128]!
1145 vrhadd.u8 q0, q0, q6
1146 vld1.64 {d4, d5}, [r2,:128]!
1147 vrhadd.u8 q1, q1, q7
1148 vld1.64 {d6, d7}, [r2,:128]!
1149 vrhadd.u8 q2, q2, q4
1150 vrhadd.u8 q3, q3, q5
1151 .ifc \type,avg
1152 vld1.8 {d16}, [r0,:64], r3
1153 vrhadd.u8 d0, d0, d16
1154 vld1.8 {d17}, [r0,:64], r3
1155 vrhadd.u8 d1, d1, d17
1156 vld1.8 {d18}, [r0,:64], r3
1157 vrhadd.u8 d2, d2, d18
1158 vld1.8 {d19}, [r0,:64], r3
1159 vrhadd.u8 d3, d3, d19
1160 vld1.8 {d20}, [r0,:64], r3
1161 vrhadd.u8 d4, d4, d20
1162 vld1.8 {d21}, [r0,:64], r3
1163 vrhadd.u8 d5, d5, d21
1164 vld1.8 {d22}, [r0,:64], r3
1165 vrhadd.u8 d6, d6, d22
1166 vld1.8 {d23}, [r0,:64], r3
1167 vrhadd.u8 d7, d7, d23
1168 sub r0, r0, r3, lsl #3
1169 .endif
1170 vst1.64 {d0}, [r0,:64], r3
1171 vst1.64 {d1}, [r0,:64], r3
1172 vst1.64 {d2}, [r0,:64], r3
1173 vst1.64 {d3}, [r0,:64], r3
1174 vst1.64 {d4}, [r0,:64], r3
1175 vst1.64 {d5}, [r0,:64], r3
1176 vst1.64 {d6}, [r0,:64], r3
1177 vst1.64 {d7}, [r0,:64], r3
1178
1179 mov lr, r10
1180 bx lr
1181 endfunc
1182 .endm
1183
1184 h264_qpel8_hv_lowpass_l2 put
1185 h264_qpel8_hv_lowpass_l2 avg
1186
1187 .macro h264_qpel16_hv type
1188 function \type\()_h264_qpel16_hv_lowpass_neon
1189 mov r9, lr
1190 bl \type\()_h264_qpel8_hv_lowpass_neon
1191 sub r1, r1, r3, lsl #2
1192 bl \type\()_h264_qpel8_hv_lowpass_neon
1193 sub r1, r1, r3, lsl #4
1194 sub r1, r1, r3, lsl #2
1195 add r1, r1, #8
1196 sub r0, r0, r2, lsl #4
1197 add r0, r0, #8
1198 bl \type\()_h264_qpel8_hv_lowpass_neon
1199 sub r1, r1, r3, lsl #2
1200 mov lr, r9
1201 b \type\()_h264_qpel8_hv_lowpass_neon
1202 endfunc
1203
1204 function \type\()_h264_qpel16_hv_lowpass_l2_neon
1205 mov r9, lr
1206 sub r2, r4, #256
1207 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1208 sub r1, r1, r3, lsl #2
1209 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1210 sub r1, r1, r3, lsl #4
1211 sub r1, r1, r3, lsl #2
1212 add r1, r1, #8
1213 sub r0, r0, r3, lsl #4
1214 add r0, r0, #8
1215 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1216 sub r1, r1, r3, lsl #2
1217 mov lr, r9
1218 b \type\()_h264_qpel8_hv_lowpass_l2_neon
1219 endfunc
1220 .endm
1221
1222 h264_qpel16_hv put
1223 h264_qpel16_hv avg
1224
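@ Quarter-pel entry points: ff_{put,avg}_h264_qpel8_mcXY handles the
@ quarter-sample offset (X/4, Y/4).  mc20/mc02 are the pure half-pel H/V
@ cases and mc22 the 2-D case; the remaining positions average a half-pel
@ result with full-pel pixels or with another half-pel plane, which is what
@ the *_l2 helpers above implement.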
1225 .macro h264_qpel8 type
1226 function ff_\type\()_h264_qpel8_mc10_neon, export=1
1227 lowpass_const r3
1228 mov r3, r1
1229 sub r1, r1, #2
1230 mov ip, #8
1231 b \type\()_h264_qpel8_h_lowpass_l2_neon
1232 endfunc
1233
1234 function ff_\type\()_h264_qpel8_mc20_neon, export=1
1235 lowpass_const r3
1236 sub r1, r1, #2
1237 mov r3, r2
1238 mov ip, #8
1239 b \type\()_h264_qpel8_h_lowpass_neon
1240 endfunc
1241
1242 function ff_\type\()_h264_qpel8_mc30_neon, export=1
1243 lowpass_const r3
1244 add r3, r1, #1
1245 sub r1, r1, #2
1246 mov ip, #8
1247 b \type\()_h264_qpel8_h_lowpass_l2_neon
1248 endfunc
1249
1250 function ff_\type\()_h264_qpel8_mc01_neon, export=1
1251 push {lr}
1252 mov ip, r1
1253 \type\()_h264_qpel8_mc01:
1254 lowpass_const r3
1255 mov r3, r2
1256 sub r1, r1, r2, lsl #1
1257 vpush {d8-d15}
1258 bl \type\()_h264_qpel8_v_lowpass_l2_neon
1259 vpop {d8-d15}
1260 pop {pc}
1261 endfunc
1262
1263 function ff_\type\()_h264_qpel8_mc11_neon, export=1
1264 push {r0, r1, r11, lr}
1265 \type\()_h264_qpel8_mc11:
1266 lowpass_const r3
1267 mov r11, sp
1268 bic sp, sp, #15
1269 sub sp, sp, #64
1270 mov r0, sp
1271 sub r1, r1, #2
1272 mov r3, #8
1273 mov ip, #8
1274 vpush {d8-d15}
1275 bl put_h264_qpel8_h_lowpass_neon
1276 ldrd r0, [r11]
1277 mov r3, r2
1278 add ip, sp, #64
1279 sub r1, r1, r2, lsl #1
1280 mov r2, #8
1281 bl \type\()_h264_qpel8_v_lowpass_l2_neon
1282 vpop {d8-d15}
1283 add sp, r11, #8
1284 pop {r11, pc}
1285 endfunc
1286
1287 function ff_\type\()_h264_qpel8_mc21_neon, export=1
1288 push {r0, r1, r4, r10, r11, lr}
1289 \type\()_h264_qpel8_mc21:
1290 lowpass_const r3
1291 mov r11, sp
1292 bic sp, sp, #15
1293 sub sp, sp, #(8*8+16*12)
1294 sub r1, r1, #2
1295 mov r3, #8
1296 mov r0, sp
1297 mov ip, #8
1298 vpush {d8-d15}
1299 bl put_h264_qpel8_h_lowpass_neon
1300 mov r4, r0
1301 ldrd r0, [r11]
1302 sub r1, r1, r2, lsl #1
1303 sub r1, r1, #2
1304 mov r3, r2
1305 sub r2, r4, #64
1306 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1307 vpop {d8-d15}
1308 add sp, r11, #8
1309 pop {r4, r10, r11, pc}
1310 endfunc
1311
1312 function ff_\type\()_h264_qpel8_mc31_neon, export=1
1313 add r1, r1, #1
1314 push {r0, r1, r11, lr}
1315 sub r1, r1, #1
1316 b \type\()_h264_qpel8_mc11
1317 endfunc
1318
1319 function ff_\type\()_h264_qpel8_mc02_neon, export=1
1320 push {lr}
1321 lowpass_const r3
1322 sub r1, r1, r2, lsl #1
1323 mov r3, r2
1324 vpush {d8-d15}
1325 bl \type\()_h264_qpel8_v_lowpass_neon
1326 vpop {d8-d15}
1327 pop {pc}
1328 endfunc
1329
1330 function ff_\type\()_h264_qpel8_mc12_neon, export=1
1331 push {r0, r1, r4, r10, r11, lr}
1332 \type\()_h264_qpel8_mc12:
1333 lowpass_const r3
1334 mov r11, sp
1335 bic sp, sp, #15
1336 sub sp, sp, #(8*8+16*12)
1337 sub r1, r1, r2, lsl #1
1338 mov r3, r2
1339 mov r2, #8
1340 mov r0, sp
1341 vpush {d8-d15}
1342 bl put_h264_qpel8_v_lowpass_neon
1343 mov r4, r0
1344 ldrd r0, [r11]
1345 sub r1, r1, r3, lsl #1
1346 sub r1, r1, #2
1347 sub r2, r4, #64
1348 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1349 vpop {d8-d15}
1350 add sp, r11, #8
1351 pop {r4, r10, r11, pc}
1352 endfunc
1353
1354 function ff_\type\()_h264_qpel8_mc22_neon, export=1
1355 push {r4, r10, r11, lr}
1356 mov r11, sp
1357 bic sp, sp, #15
1358 sub r1, r1, r2, lsl #1
1359 sub r1, r1, #2
1360 mov r3, r2
1361 sub sp, sp, #(16*12)
1362 mov r4, sp
1363 vpush {d8-d15}
1364 bl \type\()_h264_qpel8_hv_lowpass_neon
1365 vpop {d8-d15}
1366 mov sp, r11
1367 pop {r4, r10, r11, pc}
1368 endfunc
1369
1370 function ff_\type\()_h264_qpel8_mc32_neon, export=1
1371 push {r0, r1, r4, r10, r11, lr}
1372 add r1, r1, #1
1373 b \type\()_h264_qpel8_mc12
1374 endfunc
1375
1376 function ff_\type\()_h264_qpel8_mc03_neon, export=1
1377 push {lr}
1378 add ip, r1, r2
1379 b \type\()_h264_qpel8_mc01
1380 endfunc
1381
1382 function ff_\type\()_h264_qpel8_mc13_neon, export=1
1383 push {r0, r1, r11, lr}
1384 add r1, r1, r2
1385 b \type\()_h264_qpel8_mc11
1386 endfunc
1387
1388 function ff_\type\()_h264_qpel8_mc23_neon, export=1
1389 push {r0, r1, r4, r10, r11, lr}
1390 add r1, r1, r2
1391 b \type\()_h264_qpel8_mc21
1392 endfunc
1393
1394 function ff_\type\()_h264_qpel8_mc33_neon, export=1
1395 add r1, r1, #1
1396 push {r0, r1, r11, lr}
1397 add r1, r1, r2
1398 sub r1, r1, #1
1399 b \type\()_h264_qpel8_mc11
1400 endfunc
1401 .endm
1402
1403 h264_qpel8 put
1404 h264_qpel8 avg
1405
1406 .macro h264_qpel16 type
1407 function ff_\type\()_h264_qpel16_mc10_neon, export=1
1408 lowpass_const r3
1409 mov r3, r1
1410 sub r1, r1, #2
1411 b \type\()_h264_qpel16_h_lowpass_l2_neon
1412 endfunc
1413
1414 function ff_\type\()_h264_qpel16_mc20_neon, export=1
1415 lowpass_const r3
1416 sub r1, r1, #2
1417 mov r3, r2
1418 b \type\()_h264_qpel16_h_lowpass_neon
1419 endfunc
1420
1421 function ff_\type\()_h264_qpel16_mc30_neon, export=1
1422 lowpass_const r3
1423 add r3, r1, #1
1424 sub r1, r1, #2
1425 b \type\()_h264_qpel16_h_lowpass_l2_neon
1426 endfunc
1427
1428 function ff_\type\()_h264_qpel16_mc01_neon, export=1
1429 push {r4, lr}
1430 mov ip, r1
1431 \type\()_h264_qpel16_mc01:
1432 lowpass_const r3
1433 mov r3, r2
1434 sub r1, r1, r2, lsl #1
1435 vpush {d8-d15}
1436 bl \type\()_h264_qpel16_v_lowpass_l2_neon
1437 vpop {d8-d15}
1438 pop {r4, pc}
1439 endfunc
1440
1441 function ff_\type\()_h264_qpel16_mc11_neon, export=1
1442 push {r0, r1, r4, r11, lr}
1443 \type\()_h264_qpel16_mc11:
1444 lowpass_const r3
1445 mov r11, sp
1446 bic sp, sp, #15
1447 sub sp, sp, #256
1448 mov r0, sp
1449 sub r1, r1, #2
1450 mov r3, #16
1451 vpush {d8-d15}
1452 bl put_h264_qpel16_h_lowpass_neon
1453 ldrd r0, [r11]
1454 mov r3, r2
1455 add ip, sp, #64
1456 sub r1, r1, r2, lsl #1
1457 mov r2, #16
1458 bl \type\()_h264_qpel16_v_lowpass_l2_neon
1459 vpop {d8-d15}
1460 add sp, r11, #8
1461 pop {r4, r11, pc}
1462 endfunc
1463
1464 function ff_\type\()_h264_qpel16_mc21_neon, export=1
1465 push {r0, r1, r4-r5, r9-r11, lr}
1466 \type\()_h264_qpel16_mc21:
1467 lowpass_const r3
1468 mov r11, sp
1469 bic sp, sp, #15
1470 sub sp, sp, #(16*16+16*12)
1471 sub r1, r1, #2
1472 mov r0, sp
1473 vpush {d8-d15}
1474 bl put_h264_qpel16_h_lowpass_neon_packed
1475 mov r4, r0
1476 ldrd r0, [r11]
1477 sub r1, r1, r2, lsl #1
1478 sub r1, r1, #2
1479 mov r3, r2
1480 bl \type\()_h264_qpel16_hv_lowpass_l2_neon
1481 vpop {d8-d15}
1482 add sp, r11, #8
1483 pop {r4-r5, r9-r11, pc}
1484 endfunc
1485
1486 function ff_\type\()_h264_qpel16_mc31_neon, export=1
1487 add r1, r1, #1
1488 push {r0, r1, r4, r11, lr}
1489 sub r1, r1, #1
1490 b \type\()_h264_qpel16_mc11
1491 endfunc
1492
1493 function ff_\type\()_h264_qpel16_mc02_neon, export=1
1494 push {r4, lr}
1495 lowpass_const r3
1496 sub r1, r1, r2, lsl #1
1497 mov r3, r2
1498 vpush {d8-d15}
1499 bl \type\()_h264_qpel16_v_lowpass_neon
1500 vpop {d8-d15}
1501 pop {r4, pc}
1502 endfunc
1503
1504 function ff_\type\()_h264_qpel16_mc12_neon, export=1
1505 push {r0, r1, r4-r5, r9-r11, lr}
1506 \type\()_h264_qpel16_mc12:
1507 lowpass_const r3
1508 mov r11, sp
1509 bic sp, sp, #15
1510 sub sp, sp, #(16*16+16*12)
1511 sub r1, r1, r2, lsl #1
1512 mov r0, sp
1513 mov r3, r2
1514 vpush {d8-d15}
1515 bl put_h264_qpel16_v_lowpass_neon_packed
1516 mov r4, r0
1517 ldrd r0, [r11]
1518 sub r1, r1, r3, lsl #1
1519 sub r1, r1, #2
1520 mov r2, r3
1521 bl \type\()_h264_qpel16_hv_lowpass_l2_neon
1522 vpop {d8-d15}
1523 add sp, r11, #8
1524 pop {r4-r5, r9-r11, pc}
1525 endfunc
1526
1527 function ff_\type\()_h264_qpel16_mc22_neon, export=1
1528 push {r4, r9-r11, lr}
1529 lowpass_const r3
1530 mov r11, sp
1531 bic sp, sp, #15
1532 sub r1, r1, r2, lsl #1
1533 sub r1, r1, #2
1534 mov r3, r2
1535 sub sp, sp, #(16*12)
1536 mov r4, sp
1537 vpush {d8-d15}
1538 bl \type\()_h264_qpel16_hv_lowpass_neon
1539 vpop {d8-d15}
1540 mov sp, r11
1541 pop {r4, r9-r11, pc}
1542 endfunc
1543
1544 function ff_\type\()_h264_qpel16_mc32_neon, export=1
1545 push {r0, r1, r4-r5, r9-r11, lr}
1546 add r1, r1, #1
1547 b \type\()_h264_qpel16_mc12
1548 endfunc
1549
1550 function ff_\type\()_h264_qpel16_mc03_neon, export=1
1551 push {r4, lr}
1552 add ip, r1, r2
1553 b \type\()_h264_qpel16_mc01
1554 endfunc
1555
1556 function ff_\type\()_h264_qpel16_mc13_neon, export=1
1557 push {r0, r1, r4, r11, lr}
1558 add r1, r1, r2
1559 b \type\()_h264_qpel16_mc11
1560 endfunc
1561
1562 function ff_\type\()_h264_qpel16_mc23_neon, export=1
1563 push {r0, r1, r4-r5, r9-r11, lr}
1564 add r1, r1, r2
1565 b \type\()_h264_qpel16_mc21
1566 endfunc
1567
1568 function ff_\type\()_h264_qpel16_mc33_neon, export=1
1569 add r1, r1, #1
1570 push {r0, r1, r4, r11, lr}
1571 add r1, r1, r2
1572 sub r1, r1, #1
1573 b \type\()_h264_qpel16_mc11
1574 endfunc
1575 .endm
1576
1577 h264_qpel16 put
1578 h264_qpel16 avg
1579
1580 @ Biweighted prediction
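@ H.264 explicit bi-prediction weighting; per pixel this is roughly
@     dst[x] = clip8(( dst[x]*weightd + src[x]*weights
@                    + (((offset + 1) | 1) << log2_denom) ) >> (log2_denom + 1))
@ where weightd/weights/offset are the stack arguments.  The dispatch in
@ biweight_h264_pixels_*_neon picks one of the labels 10/20/30/40 according
@ to the signs of the two weights, so the vmlal/vmlsl multiplies always see
@ non-negative scalar weights.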
1581
1582 .macro biweight_16 macs, macd
1583 vdup.8 d0, r4
1584 vdup.8 d1, r5
1585 vmov q2, q8
1586 vmov q3, q8
1587 1: subs ip, ip, #2
1588 vld1.8 {d20-d21},[r0,:128], r2
1589 \macd q2, d0, d20
1590 pld [r0]
1591 \macd q3, d0, d21
1592 vld1.8 {d22-d23},[r1,:128], r2
1593 \macs q2, d1, d22
1594 pld [r1]
1595 \macs q3, d1, d23
1596 vmov q12, q8
1597 vld1.8 {d28-d29},[r0,:128], r2
1598 vmov q13, q8
1599 \macd q12, d0, d28
1600 pld [r0]
1601 \macd q13, d0, d29
1602 vld1.8 {d30-d31},[r1,:128], r2
1603 \macs q12, d1, d30
1604 pld [r1]
1605 \macs q13, d1, d31
1606 vshl.s16 q2, q2, q9
1607 vshl.s16 q3, q3, q9
1608 vqmovun.s16 d4, q2
1609 vqmovun.s16 d5, q3
1610 vshl.s16 q12, q12, q9
1611 vshl.s16 q13, q13, q9
1612 vqmovun.s16 d24, q12
1613 vqmovun.s16 d25, q13
1614 vmov q3, q8
1615 vst1.8 {d4- d5}, [r6,:128], r2
1616 vmov q2, q8
1617 vst1.8 {d24-d25},[r6,:128], r2
1618 bne 1b
1619 pop {r4-r6, pc}
1620 .endm
1621
1622 .macro biweight_8 macs, macd
1623 vdup.8 d0, r4
1624 vdup.8 d1, r5
1625 vmov q1, q8
1626 vmov q10, q8
1627 1: subs ip, ip, #2
1628 vld1.8 {d4},[r0,:64], r2
1629 \macd q1, d0, d4
1630 pld [r0]
1631 vld1.8 {d5},[r1,:64], r2
1632 \macs q1, d1, d5
1633 pld [r1]
1634 vld1.8 {d6},[r0,:64], r2
1635 \macd q10, d0, d6
1636 pld [r0]
1637 vld1.8 {d7},[r1,:64], r2
1638 \macs q10, d1, d7
1639 pld [r1]
1640 vshl.s16 q1, q1, q9
1641 vqmovun.s16 d2, q1
1642 vshl.s16 q10, q10, q9
1643 vqmovun.s16 d4, q10
1644 vmov q10, q8
1645 vst1.8 {d2},[r6,:64], r2
1646 vmov q1, q8
1647 vst1.8 {d4},[r6,:64], r2
1648 bne 1b
1649 pop {r4-r6, pc}
1650 .endm
1651
1652 .macro biweight_4 macs, macd
1653 vdup.8 d0, r4
1654 vdup.8 d1, r5
1655 vmov q1, q8
1656 vmov q10, q8
1657 1: subs ip, ip, #4
1658 vld1.32 {d4[0]},[r0,:32], r2
1659 vld1.32 {d4[1]},[r0,:32], r2
1660 \macd q1, d0, d4
1661 pld [r0]
1662 vld1.32 {d5[0]},[r1,:32], r2
1663 vld1.32 {d5[1]},[r1,:32], r2
1664 \macs q1, d1, d5
1665 pld [r1]
1666 blt 2f
1667 vld1.32 {d6[0]},[r0,:32], r2
1668 vld1.32 {d6[1]},[r0,:32], r2
1669 \macd q10, d0, d6
1670 pld [r0]
1671 vld1.32 {d7[0]},[r1,:32], r2
1672 vld1.32 {d7[1]},[r1,:32], r2
1673 \macs q10, d1, d7
1674 pld [r1]
1675 vshl.s16 q1, q1, q9
1676 vqmovun.s16 d2, q1
1677 vshl.s16 q10, q10, q9
1678 vqmovun.s16 d4, q10
1679 vmov q10, q8
1680 vst1.32 {d2[0]},[r6,:32], r2
1681 vst1.32 {d2[1]},[r6,:32], r2
1682 vmov q1, q8
1683 vst1.32 {d4[0]},[r6,:32], r2
1684 vst1.32 {d4[1]},[r6,:32], r2
1685 bne 1b
1686 pop {r4-r6, pc}
1687 2: vshl.s16 q1, q1, q9
1688 vqmovun.s16 d2, q1
1689 vst1.32 {d2[0]},[r6,:32], r2
1690 vst1.32 {d2[1]},[r6,:32], r2
1691 pop {r4-r6, pc}
1692 .endm
1693
1694 .macro biweight_func w
1695 function biweight_h264_pixels_\w\()_neon
1696 push {r4-r6, lr}
1697 add r4, sp, #16
1698 ldm r4, {r4-r6}
1699 lsr lr, r4, #31
1700 add r6, r6, #1
1701 eors lr, lr, r5, lsr #30
1702 orr r6, r6, #1
1703 vdup.16 q9, r3
1704 lsl r6, r6, r3
1705 vmvn q9, q9
1706 vdup.16 q8, r6
1707 mov r6, r0
1708 beq 10f
1709 subs lr, lr, #1
1710 beq 20f
1711 subs lr, lr, #1
1712 beq 30f
1713 b 40f
1714 10: biweight_\w vmlal.u8, vmlal.u8
1715 20: rsb r4, r4, #0
1716 biweight_\w vmlal.u8, vmlsl.u8
1717 30: rsb r4, r4, #0
1718 rsb r5, r5, #0
1719 biweight_\w vmlsl.u8, vmlsl.u8
1720 40: rsb r5, r5, #0
1721 biweight_\w vmlsl.u8, vmlal.u8
1722 endfunc
1723 .endm
1724
1725 .macro biweight_entry w, h, b=1
1726 function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
1727 mov ip, #\h
1728 .if \b
1729 b biweight_h264_pixels_\w\()_neon
1730 .endif
1731 endfunc
1732 .endm
1733
1734 biweight_entry 16, 8
1735 biweight_entry 16, 16, b=0
1736 biweight_func 16
1737
1738 biweight_entry 8, 16
1739 biweight_entry 8, 4
1740 biweight_entry 8, 8, b=0
1741 biweight_func 8
1742
1743 biweight_entry 4, 8
1744 biweight_entry 4, 2
1745 biweight_entry 4, 4, b=0
1746 biweight_func 4
1747
1748 @ Weighted prediction
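@ H.264 explicit uni-prediction weighting; per pixel this is roughly
@     block[x] = clip8(((block[x]*weight + (1 << (log2_denom - 1)))
@                        >> log2_denom) + offset)
@ The offset is pre-shifted and folded into the accumulator (q8), and
@ weight_h264_pixels_*_neon selects add/sub and halving/non-halving
@ variants at labels 10/20 depending on the weight sign and the size of
@ log2_denom.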
1749
1750 .macro weight_16 add
1751 vdup.8 d0, r3
1752 1: subs ip, ip, #2
1753 vld1.8 {d20-d21},[r0,:128], r1
1754 vmull.u8 q2, d0, d20
1755 pld [r0]
1756 vmull.u8 q3, d0, d21
1757 vld1.8 {d28-d29},[r0,:128], r1
1758 vmull.u8 q12, d0, d28
1759 pld [r0]
1760 vmull.u8 q13, d0, d29
1761 \add q2, q8, q2
1762 vrshl.s16 q2, q2, q9
1763 \add q3, q8, q3
1764 vrshl.s16 q3, q3, q9
1765 vqmovun.s16 d4, q2
1766 vqmovun.s16 d5, q3
1767 \add q12, q8, q12
1768 vrshl.s16 q12, q12, q9
1769 \add q13, q8, q13
1770 vrshl.s16 q13, q13, q9
1771 vqmovun.s16 d24, q12
1772 vqmovun.s16 d25, q13
1773 vst1.8 {d4- d5}, [r4,:128], r1
1774 vst1.8 {d24-d25},[r4,:128], r1
1775 bne 1b
1776 pop {r4, pc}
1777 .endm
1778
1779 .macro weight_8 add
1780 vdup.8 d0, r3
1781 1: subs ip, ip, #2
1782 vld1.8 {d4},[r0,:64], r1
1783 vmull.u8 q1, d0, d4
1784 pld [r0]
1785 vld1.8 {d6},[r0,:64], r1
1786 vmull.u8 q10, d0, d6
1787 \add q1, q8, q1
1788 pld [r0]
1789 vrshl.s16 q1, q1, q9
1790 vqmovun.s16 d2, q1
1791 \add q10, q8, q10
1792 vrshl.s16 q10, q10, q9
1793 vqmovun.s16 d4, q10
1794 vst1.8 {d2},[r4,:64], r1
1795 vst1.8 {d4},[r4,:64], r1
1796 bne 1b
1797 pop {r4, pc}
1798 .endm
1799
1800 .macro weight_4 add
1801 vdup.8 d0, r3
1802 vmov q1, q8
1803 vmov q10, q8
1804 1: subs ip, ip, #4
1805 vld1.32 {d4[0]},[r0,:32], r1
1806 vld1.32 {d4[1]},[r0,:32], r1
1807 vmull.u8 q1, d0, d4
1808 pld [r0]
1809 blt 2f
1810 vld1.32 {d6[0]},[r0,:32], r1
1811 vld1.32 {d6[1]},[r0,:32], r1
1812 vmull.u8 q10, d0, d6
1813 pld [r0]
1814 \add q1, q8, q1
1815 vrshl.s16 q1, q1, q9
1816 vqmovun.s16 d2, q1
1817 \add q10, q8, q10
1818 vrshl.s16 q10, q10, q9
1819 vqmovun.s16 d4, q10
1820 vmov q10, q8
1821 vst1.32 {d2[0]},[r4,:32], r1
1822 vst1.32 {d2[1]},[r4,:32], r1
1823 vmov q1, q8
1824 vst1.32 {d4[0]},[r4,:32], r1
1825 vst1.32 {d4[1]},[r4,:32], r1
1826 bne 1b
1827 pop {r4, pc}
1828 2: \add q1, q8, q1
1829 vrshl.s16 q1, q1, q9
1830 vqmovun.s16 d2, q1
1831 vst1.32 {d2[0]},[r4,:32], r1
1832 vst1.32 {d2[1]},[r4,:32], r1
1833 pop {r4, pc}
1834 .endm
1835
1836 .macro weight_func w
1837 function weight_h264_pixels_\w\()_neon
1838 push {r4, lr}
1839 ldr r4, [sp, #8]
1840 cmp r2, #1
1841 lsl r4, r4, r2
1842 vdup.16 q8, r4
1843 mov r4, r0
1844 ble 20f
1845 rsb lr, r2, #1
1846 vdup.16 q9, lr
1847 cmp r3, #0
1848 blt 10f
1849 weight_\w vhadd.s16
1850 10: rsb r3, r3, #0
1851 weight_\w vhsub.s16
1852 20: rsb lr, r2, #0
1853 vdup.16 q9, lr
1854 cmp r3, #0
1855 blt 10f
1856 weight_\w vadd.s16
1857 10: rsb r3, r3, #0
1858 weight_\w vsub.s16
1859 endfunc
1860 .endm
1861
1862 .macro weight_entry w, h, b=1
1863 function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
1864 mov ip, #\h
1865 .if \b
1866 b weight_h264_pixels_\w\()_neon
1867 .endif
1868 endfunc
1869 .endm
1870
1871 weight_entry 16, 8
1872 weight_entry 16, 16, b=0
1873 weight_func 16
1874
1875 weight_entry 8, 16
1876 weight_entry 8, 4
1877 weight_entry 8, 8, b=0
1878 weight_func 8
1879
1880 weight_entry 4, 8
1881 weight_entry 4, 2
1882 weight_entry 4, 4, b=0
1883 weight_func 4