ARM: set size of asm functions in object files
[libav.git] / libavcodec / arm / h264idct_neon.S
1 /*
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "asm.S"
22
23 preserve8  /* macro supplied by asm.S (not visible here); presumably records that this object keeps the stack 8-byte aligned -- TODO confirm */
24 .text  /* everything up to the .section directive further down is emitted into the text section */
25
/*
 * ff_h264_idct_add_neon: 4x4 inverse transform of 16-bit coefficients whose
 * rounded result is added to a 4x4 block of 8-bit pixels.
 *
 * In:  r0 = pixel block: 4 rows of 4 bytes, each row accessed with a 32-bit
 *           alignment hint
 *      r1 = 16 signed 16-bit coefficients, loaded with a 128-bit alignment hint
 *      r2 = byte distance between consecutive pixel rows
 * Out: the 16 pixels are rewritten in place; r0 is left advanced by 4*r2.
 * Writes q0-q3 and q8-q9; r0 is the only core register modified.
 *
 * Each of the two passes computes, per 16-bit lane, from inputs a, b, c, d:
 *      z0 = a + c          z1 = a - c
 *      z2 = (b >> 1) - d   z3 = b + (d >> 1)
 *      results: z0 + z3, z1 + z2, z1 - z2, z0 - z3
 * with a 4x4 transpose between the passes.
 *
 * NOTE(review): the pixel loads are interleaved with the second pass,
 * presumably to hide load latency -- keep the instruction order unless a
 * change has been measured.
 */
26 function ff_h264_idct_add_neon, export=1
27 vld1.64 {d0-d3}, [r1,:128]  /* d0..d3 = input vectors 0..3, four coefficients each */
28
29 vswp d1, d2  /* d1 = input 2; q1 = {d2 = input 1, d3 = input 3} */
30 vadd.i16 d4, d0, d1  /* z0 = in0 + in2 */
31 vshr.s16 q8, q1, #1  /* d16 = in1 >> 1, d17 = in3 >> 1 */
32 vsub.i16 d5, d0, d1  /* z1 = in0 - in2 */
33 vadd.i16 d6, d2, d17  /* z3 = in1 + (in3 >> 1) */
34 vsub.i16 d7, d16, d3  /* z2 = (in1 >> 1) - in3 */
35 vadd.i16 q0, q2, q3  /* d0 = z0 + z3, d1 = z1 + z2 */
36 vsub.i16 q1, q2, q3  /* d2 = z0 - z3, d3 = z1 - z2 */
37
38 vtrn.16 d0, d1  /* 4x4 transpose of the vectors taken in the order d0, d1, d3, d2 */
39 vtrn.16 d3, d2
40 vtrn.32 d0, d3
41 vtrn.32 d1, d2  /* now t0 = d0, t1 = d1, t2 = d3, t3 = d2 */
42
43 vadd.i16 d4, d0, d3  /* z0 = t0 + t2 */
44 vld1.32 {d18[0]}, [r0,:32], r2  /* pixel row 0 -> low half of d18 */
45 vswp d1, d3  /* d1 = t2; q1 = {d2 = t3, d3 = t1} */
46 vshr.s16 q8, q1, #1  /* d16 = t3 >> 1, d17 = t1 >> 1 */
47 vld1.32 {d19[1]}, [r0,:32], r2  /* pixel row 1 -> high half of d19 */
48 vsub.i16 d5, d0, d1  /* z1 = t0 - t2 */
49 vld1.32 {d18[1]}, [r0,:32], r2  /* pixel row 2 -> high half of d18 */
50 vadd.i16 d6, d16, d3  /* z3 = (t3 >> 1) + t1 */
51 vld1.32 {d19[0]}, [r0,:32], r2  /* pixel row 3 -> low half of d19 */
52 vsub.i16 d7, d2, d17  /* t3 - (t1 >> 1), i.e. -z2 */
53 sub r0, r0, r2, lsl #2  /* step r0 back by 4*r2 to pixel row 0 */
54 vadd.i16 q0, q2, q3  /* d0 = z0 + z3 (row 0), d1 = z1 - z2 (row 2) */
55 vsub.i16 q1, q2, q3  /* d2 = z0 - z3 (row 3), d3 = z1 + z2 (row 1) */
56
57 vrshr.s16 q0, q0, #6  /* rounding shift: (x + 32) >> 6 */
58 vrshr.s16 q1, q1, #6
59
60 vaddw.u8 q0, q0, d18  /* add zero-extended pixels of rows 0 and 2 */
61 vaddw.u8 q1, q1, d19  /* add zero-extended pixels of rows 3 and 1 */
62
63 vqmovun.s16 d0, q0  /* saturate to 0..255 and narrow: d0 = rows 0, 2 */
64 vqmovun.s16 d1, q1  /* d1 = rows 3, 1 */
65
66 vst1.32 {d0[0]}, [r0,:32], r2  /* store row 0 */
67 vst1.32 {d1[1]}, [r0,:32], r2  /* store row 1 */
68 vst1.32 {d0[1]}, [r0,:32], r2  /* store row 2 */
69 vst1.32 {d1[0]}, [r0,:32], r2  /* store row 3 */
70
71 bx lr
72 endfunc
73
/*
 * ff_h264_idct_dc_add_neon: add one rounded 16-bit value to every pixel of a
 * 4x4 block of 8-bit pixels.
 *
 * In:  r0 = pixel block: 4 rows of 4 bytes, each row accessed with a 32-bit
 *           alignment hint
 *      r1 = address of the signed 16-bit value (only this one halfword is read)
 *      r2 = byte distance between consecutive pixel rows
 * Out: the 16 pixels are rewritten in place; r0 is left advanced by 4*r2.
 * Writes q0-q2; r0 is the only core register modified.
 */
74 function ff_h264_idct_dc_add_neon, export=1
75 vld1.16 {d2[],d3[]}, [r1,:16]  /* load one halfword and replicate it into all 8 lanes of q1 */
76 vrshr.s16 q1, q1, #6  /* rounding shift: (x + 32) >> 6 */
77 vld1.32 {d0[0]}, [r0,:32], r2  /* pixel row 0 */
78 vld1.32 {d0[1]}, [r0,:32], r2  /* pixel row 1 */
79 vaddw.u8 q2, q1, d0  /* q2 = value + zero-extended pixels of rows 0 and 1 */
80 vld1.32 {d1[0]}, [r0,:32], r2  /* pixel row 2 */
81 vld1.32 {d1[1]}, [r0,:32], r2  /* pixel row 3 */
82 vaddw.u8 q1, q1, d1  /* q1 = value + zero-extended pixels of rows 2 and 3 */
83 vqmovun.s16 d0, q2  /* saturate to 0..255 and narrow: rows 0, 1 */
84 vqmovun.s16 d1, q1  /* rows 2, 3 */
85 sub r0, r0, r2, lsl #2  /* step r0 back by 4*r2 to pixel row 0 */
86 vst1.32 {d0[0]}, [r0,:32], r2  /* store row 0 */
87 vst1.32 {d0[1]}, [r0,:32], r2  /* store row 1 */
88 vst1.32 {d1[0]}, [r0,:32], r2  /* store row 2 */
89 vst1.32 {d1[1]}, [r0,:32], r2  /* store row 3 */
90 bx lr
91 endfunc
92
/*
 * ff_h264_idct_add16_neon: walk 16 coefficient blocks and, for each block
 * whose count byte is non-zero, call either ff_h264_idct_dc_add_neon or
 * ff_h264_idct_add_neon on it.
 *
 * In:  r0   = base pointer added to every per-block offset
 *      r1   = table of 16 32-bit offsets, one per block
 *      r2   = coefficients, 32 bytes per block
 *      r3   = row stride, forwarded to the per-block routines in r2
 *      [sp] = byte table, indexed through scan8
 * NOTE(review): the byte table is presumably the decoder's per-block count of
 * non-zero coefficients -- confirm against the C caller.
 *
 * Per block, with n = count byte:
 *      n == 0                         -> skip
 *      n == 1 and coefficient 0 != 0  -> ff_h264_idct_dc_add_neon
 *      otherwise                      -> ff_h264_idct_add_neon
 *
 * ip, r1 and r2 stay live across the calls; both callees write no core
 * register other than r0.
 */
93 function ff_h264_idct_add16_neon, export=1
94 push {r4-r8,lr}  /* 6 registers = 24 bytes, so the caller's [sp] is now [sp, #24] */
95 mov r4, r0  /* r4 = base pointer */
96 mov r5, r1  /* r5 = cursor into the offset table */
97 mov r1, r2  /* r1 = coefficient pointer, where the callees expect it */
98 mov r2, r3  /* r2 = row stride, where the callees expect it */
99 ldr r6, [sp, #24]  /* r6 = byte table (first stack-passed argument) */
100 movrel r7, scan8  /* r7 = cursor into scan8 */
101 mov ip, #16  /* ip = blocks left */
102 1: ldrb r8, [r7], #1  /* r8 = next scan8 entry */
103 ldr r0, [r5], #4  /* r0 = this block's offset */
104 ldrb r8, [r6, r8]  /* r8 = n, the count byte for this block */
105 subs r8, r8, #1  /* flags from n - 1 are used by the blt and movne below */
106 blt 2f  /* n == 0: nothing to add */
107 ldrsh lr, [r1]  /* lr = coefficient 0 (signed 16-bit); flags unchanged */
108 add r0, r0, r4  /* r0 = base + offset; flags unchanged */
109 movne lr, #0  /* n != 1: force the full-transform choice */
110 cmp lr, #0
111 adrne lr, ff_h264_idct_dc_add_neon  /* n == 1 and coefficient 0 != 0 */
112 adreq lr, ff_h264_idct_add_neon  /* every other non-skipped case */
113 blx lr  /* call the selected routine */
114 2: subs ip, ip, #1
115 add r1, r1, #32  /* next block of coefficients; flags unchanged for the bne */
116 bne 1b
117 pop {r4-r8,pc}
118 endfunc
119
/*
 * ff_h264_idct_add16intra_neon: walk 16 coefficient blocks and call
 * ff_h264_idct_add_neon or ff_h264_idct_dc_add_neon on each block that has
 * anything to add.
 *
 * In:  r0   = base pointer added to every per-block offset
 *      r1   = table of 16 32-bit offsets, one per block
 *      r2   = coefficients, 32 bytes per block
 *      r3   = row stride, forwarded to the per-block routines in r2
 *      [sp] = byte table, indexed through scan8
 * NOTE(review): the byte table is presumably the decoder's per-block count of
 * non-zero coefficients -- confirm against the C caller.
 *
 * Per block, with n = count byte:
 *      n != 0                         -> ff_h264_idct_add_neon
 *      n == 0 and coefficient 0 != 0  -> ff_h264_idct_dc_add_neon
 *      n == 0 and coefficient 0 == 0  -> skip
 *
 * ip, r1 and r2 stay live across the calls; both callees write no core
 * register other than r0.
 */
120 function ff_h264_idct_add16intra_neon, export=1
121 push {r4-r8,lr}  /* 6 registers = 24 bytes, so the caller's [sp] is now [sp, #24] */
122 mov r4, r0  /* r4 = base pointer */
123 mov r5, r1  /* r5 = cursor into the offset table */
124 mov r1, r2  /* r1 = coefficient pointer, where the callees expect it */
125 mov r2, r3  /* r2 = row stride, where the callees expect it */
126 ldr r6, [sp, #24]  /* r6 = byte table (first stack-passed argument) */
127 movrel r7, scan8  /* r7 = cursor into scan8 */
128 mov ip, #16  /* ip = blocks left */
129 1: ldrb r8, [r7], #1  /* r8 = next scan8 entry */
130 ldr r0, [r5], #4  /* r0 = this block's offset */
131 ldrb r8, [r6, r8]  /* r8 = n, the count byte for this block */
132 add r0, r0, r4  /* r0 = base + offset */
133 cmp r8, #0  /* NE <=> n != 0 */
134 ldrsh r8, [r1]  /* r8 = coefficient 0 (signed 16-bit); flags unchanged */
135 adrne lr, ff_h264_idct_add_neon  /* n != 0: full transform */
136 adreq lr, ff_h264_idct_dc_add_neon  /* n == 0: DC-only candidate */
137 cmpeq r8, #0  /* only when n == 0: NE <=> coefficient 0 != 0 */
138 blxne lr  /* call unless n == 0 and coefficient 0 == 0 */
139 subs ip, ip, #1
140 add r1, r1, #32  /* next block of coefficients; flags unchanged for the bne */
141 bne 1b
142 pop {r4-r8,pc}
143 endfunc
144
/*
 * ff_h264_idct_add8_neon: walk 8 coefficient blocks (block indices 16..23)
 * and call ff_h264_idct_add_neon or ff_h264_idct_dc_add_neon on each block
 * that has anything to add. The first four blocks go to the first
 * destination pointer, the last four to the second.
 *
 * In:  r0   = address of two consecutive destination base pointers
 *      r1   = table of 32-bit offsets; entries 16..23 are used
 *      r2   = coefficients, 32 bytes per block; blocks 16..23 are used
 *      r3   = row stride, forwarded to the per-block routines in r2
 *      [sp] = byte table, indexed through scan8[16..23]
 * NOTE(review): the byte table is presumably the decoder's per-block count of
 * non-zero coefficients, and the two destinations the two chroma planes --
 * confirm against the C caller.
 *
 * Per block, with n = count byte:
 *      n != 0                         -> ff_h264_idct_add_neon
 *      n == 0 and coefficient 0 != 0  -> ff_h264_idct_dc_add_neon
 *      n == 0 and coefficient 0 == 0  -> skip
 *
 * The loop counter runs 7..0 so that its bit 2 is set for exactly the first
 * four iterations; counting 8..1 would set it for iterations 1..4 and send
 * blocks to the wrong destination.
 *
 * ip, r1 and r2 stay live across the calls; both callees write no core
 * register other than r0.
 */
145 function ff_h264_idct_add8_neon, export=1
146 push {r4-r10,lr}  /* 8 registers = 32 bytes, so the caller's [sp] is now [sp, #32] */
147 ldm r0, {r4,r9}  /* r4 = first destination base, r9 = second destination base */
148 add r5, r1, #16*4  /* r5 = cursor into the offset table, starting at entry 16 */
149 add r1, r2, #16*32  /* r1 = coefficient pointer, starting at block 16 */
150 mov r2, r3  /* r2 = row stride, where the callees expect it */
151 ldr r6, [sp, #32]  /* r6 = byte table (first stack-passed argument) */
152 movrel r7, scan8+16  /* r7 = cursor into scan8, starting at entry 16 */
153 mov ip, #7  /* ip = blocks left minus one: 7..4 -> first plane, 3..0 -> second */
154 1: ldrb r8, [r7], #1  /* r8 = next scan8 entry */
155 ldr r0, [r5], #4  /* r0 = this block's offset */
156 ldrb r8, [r6, r8]  /* r8 = n, the count byte for this block */
157 tst ip, #4  /* NE <=> one of the first four blocks */
158 addne r0, r0, r4  /* first four blocks: first destination + offset */
159 addeq r0, r0, r9  /* last four blocks: second destination + offset */
160 cmp r8, #0  /* NE <=> n != 0 */
161 ldrsh r8, [r1]  /* r8 = coefficient 0 (signed 16-bit); flags unchanged */
162 adrne lr, ff_h264_idct_add_neon  /* n != 0: full transform */
163 adreq lr, ff_h264_idct_dc_add_neon  /* n == 0: DC-only candidate */
164 cmpeq r8, #0  /* only when n == 0: NE <=> coefficient 0 != 0 */
165 blxne lr  /* call unless n == 0 and coefficient 0 == 0 */
166 subs ip, ip, #1
167 add r1, r1, #32  /* next block of coefficients; flags unchanged for the bge */
168 bge 1b  /* loop until ip goes negative: 8 iterations */
169 pop {r4-r10,pc}
170 endfunc
171
/*
 * scan8: 24 byte indices, one per coefficient block. The loop functions in
 * this file read entry i and use it as an index into the byte table they
 * receive on the stack. Entries 0..15 are used by ff_h264_idct_add16_neon and
 * ff_h264_idct_add16intra_neon, entries 16..23 (scan8+16) by
 * ff_h264_idct_add8_neon.
 * NOTE(review): the x + y*8 spelling suggests (column, row) positions in an
 * 8-entry-wide grid -- confirm against the C definition of the byte table.
 */
172 .section .rodata
173 scan8: .byte 4+1*8, 5+1*8, 4+2*8, 5+2*8  /* entries 0..3 */
174 .byte 6+1*8, 7+1*8, 6+2*8, 7+2*8  /* entries 4..7 */
175 .byte 4+3*8, 5+3*8, 4+4*8, 5+4*8  /* entries 8..11 */
176 .byte 6+3*8, 7+3*8, 6+4*8, 7+4*8  /* entries 12..15 */
177 .byte 1+1*8, 2+1*8  /* entries 16..19: first group of four read via scan8+16 */
178 .byte 1+2*8, 2+2*8
179 .byte 1+4*8, 2+4*8  /* entries 20..23: second group of four */
180 .byte 1+5*8, 2+5*8