ARM: work around linker bug with movw/movt relocations in shared libs
[libav.git] / libavcodec / arm / h264idct_neon.S
1 /*
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "asm.S"
22
23 preserve8
24 .fpu neon
25
26 .text
27
/*
 * ff_h264_idct_add_neon
 *
 * Two-pass 4x4 inverse transform of sixteen 16-bit coefficients; the
 * result is shifted right by 6, added to a 4x4 block of 8-bit pixels
 * and stored back with unsigned saturation.
 *
 * In:   r0 = address of the first pixel row (each row is accessed as
 *            one 32-bit aligned word)
 *       r1 = address of the coefficients (32 bytes, 128-bit aligned)
 *       r2 = byte distance between consecutive pixel rows
 * Out:  r0 = r0 + 4*r2
 * Clobbers: r3, q0-q3, q8, q9.  Core flags are left untouched.
 */
function ff_h264_idct_add_neon, export=1
        vmov.i16        d16, #0
        mov             r3,  #32
        vmov.16         d16[0], r3              /* d16 = { 32, 0, 0, 0 } */
        vld1.64         {d0-d3}, [r1,:128]      /* d0..d3 = four groups of four coefficients */
        vadd.i16        d0,  d0,  d16           /* add 32 to coefficient 0; it reaches every output, so the >> 6 below rounds */

        /* First pass.  With a,b,c,d = the loaded d0,d1,d2,d3: */
        vswp            d1,  d2                 /* d1 = c, d2 = b */
        vadd.i16        d4,  d0,  d1            /* d4 = a + c */
        vshr.s16        q8,  q1,  #1            /* d16 = b >> 1, d17 = d >> 1 */
        vsub.i16        d5,  d0,  d1            /* d5 = a - c */
        vadd.i16        d6,  d2,  d17           /* d6 = b + (d >> 1) */
        vsub.i16        d7,  d16, d3            /* d7 = (b >> 1) - d */
        vadd.i16        q0,  q2,  q3            /* d0 = d4 + d6, d1 = d5 + d7 */
        vsub.i16        q1,  q2,  q3            /* d2 = d4 - d6, d3 = d5 - d7 */

        /* Transpose the 4x4 matrix of 16-bit values whose rows are d0, d1, d3, d2. */
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2
        vtrn.32         d0,  d3
        vtrn.32         d1,  d2

        /*
         * Second pass, interleaved with the pixel loads.  The pixel rows
         * end up as d18 = { row0, row2 } and d19 = { row3, row1 }, the same
         * order in which the results are produced in q0 and q1.
         */
        vadd.i16        d4,  d0,  d3
        vld1.32         {d18[0]}, [r0,:32], r2  /* pixel row 0 */
        vswp            d1,  d3
        vshr.s16        q8,  q1,  #1            /* d16 = d2 >> 1, d17 = d3 >> 1 */
        vld1.32         {d19[1]}, [r0,:32], r2  /* pixel row 1 */
        vsub.i16        d5,  d0,  d1
        vld1.32         {d18[1]}, [r0,:32], r2  /* pixel row 2 */
        vadd.i16        d6,  d16, d3
        vld1.32         {d19[0]}, [r0,:32], r2  /* pixel row 3 */
        vsub.i16        d7,  d2,  d17
        sub             r0,  r0,  r2,  lsl #2   /* rewind r0 to row 0 for the stores */
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vshr.s16        q0,  q0,  #6
        vshr.s16        q1,  q1,  #6

        vaddw.u8        q0,  q0,  d18           /* residual + pixels, still 16-bit */
        vaddw.u8        q1,  q1,  d19

        /* Narrow with unsigned saturation.  q0 must be consumed before d1 is overwritten. */
        vqmovun.s16     d0,  q0
        vqmovun.s16     d1,  q1

        /* Store using the same lane-to-row mapping as the loads. */
        vst1.32         {d0[0]}, [r0,:32], r2
        vst1.32         {d1[1]}, [r0,:32], r2
        vst1.32         {d0[1]}, [r0,:32], r2
        vst1.32         {d1[0]}, [r0,:32], r2

        bx              lr
        .endfunc
79
/*
 * ff_h264_idct_dc_add_neon
 *
 * Adds one value, (c + 32) >> 6 where c is the signed 16-bit word at
 * [r1], to every pixel of a 4x4 block of 8-bit pixels, saturating the
 * result to 0..255.
 *
 * In:   r0 = address of the first pixel row (each row is accessed as
 *            one 32-bit aligned word)
 *       r1 = address of the 16-bit value (16-bit aligned)
 *       r2 = byte distance between consecutive pixel rows
 * Out:  r0 = r0 + 4*r2
 * Clobbers: q0-q2.  Core flags are left untouched.
 */
function ff_h264_idct_dc_add_neon, export=1
        vld1.16         {d2[],d3[]}, [r1,:16]   /* replicate the value into all 8 lanes of q1 */
        vrshr.s16       q1,  q1,  #6            /* rounding shift: (c + 32) >> 6 */
        vld1.32         {d0[0]}, [r0,:32], r2   /* pixel rows 0 and 1 -> d0 */
        vld1.32         {d0[1]}, [r0,:32], r2
        vaddw.u8        q2,  q1,  d0            /* q2 = value + rows 0-1 */
        vld1.32         {d1[0]}, [r0,:32], r2   /* pixel rows 2 and 3 -> d1 */
        vld1.32         {d1[1]}, [r0,:32], r2
        sub             r0,  r0,  r2,  lsl #2   /* rewind r0 to row 0 for the stores */
        vaddw.u8        q1,  q1,  d1            /* q1 = value + rows 2-3 */
        vqmovun.s16     d0,  q2                 /* narrow with unsigned saturation */
        vqmovun.s16     d1,  q1
        vst1.32         {d0[0]}, [r0,:32], r2
        vst1.32         {d0[1]}, [r0,:32], r2
        vst1.32         {d1[0]}, [r0,:32], r2
        vst1.32         {d1[1]}, [r0,:32], r2
        bx              lr
        .endfunc
98
/*
 * ff_h264_idct_add16_neon
 *
 * Walks 16 blocks and, per block, calls ff_h264_idct_add_neon,
 * ff_h264_idct_dc_add_neon or nothing, depending on a per-block count
 * byte and on the block's first coefficient.
 *
 * In:   r0   = base address; each block's offset is added to it
 *       r1   = table of 32-bit offsets, one per block
 *       r2   = coefficients, 32 bytes per block
 *       r3   = row stride, handed to the callee in r2
 *       [sp] = byte table indexed through scan8 (presumably the
 *              per-block nonzero-coefficient counts -- confirm with caller)
 *
 * For each block, with n = table[scan8[i]]:
 *       n == 0                               -> block is skipped
 *       n == 1 and first coefficient != 0    -> ff_h264_idct_dc_add_neon
 *       otherwise                            -> ff_h264_idct_add_neon
 *
 * Relies on both callees leaving r1, r2 and r12 unchanged.
 */
function ff_h264_idct_add16_neon, export=1
        push            {r4-r8,lr}
        ldr             r6,  [sp, #24]          /* first stack argument (six registers were pushed) */
        mov             r5,  r1                 /* r5 = offset table cursor */
        mov             r1,  r2                 /* r1 = coefficient cursor, as the callees expect */
        mov             r2,  r3                 /* r2 = stride, as the callees expect */
        mov             r4,  r0                 /* r4 = base address */
        movrel          r7,  scan8
        mov             r12, #16                /* blocks left */
1:      ldr             r0,  [r5], #4           /* r0 = offset of this block */
        ldrb            r8,  [r7], #1           /* r8 = scan8[i] */
        ldrb            r8,  [r6, r8]           /* r8 = n */
        subs            r8,  r8,  #1
        blt             2f                      /* n == 0: nothing to do */
        add             r0,  r0,  r4
        ldrsh           lr,  [r1]               /* first coefficient */
        movne           lr,  #0                 /* n > 1: force the full transform */
        cmp             lr,  #0
        adreq           lr,  ff_h264_idct_add_neon
        adrne           lr,  ff_h264_idct_dc_add_neon
        blx             lr
2:      add             r1,  r1,  #32           /* next coefficient block */
        subs            r12, r12, #1
        bne             1b
        pop             {r4-r8,pc}
        .endfunc
125
/*
 * ff_h264_idct_add16intra_neon
 *
 * Walks 16 blocks and, per block, calls ff_h264_idct_add_neon,
 * ff_h264_idct_dc_add_neon or nothing.
 *
 * In:   r0   = base address; each block's offset is added to it
 *       r1   = table of 32-bit offsets, one per block
 *       r2   = coefficients, 32 bytes per block
 *       r3   = row stride, handed to the callee in r2
 *       [sp] = byte table indexed through scan8 (presumably the
 *              per-block nonzero-coefficient counts -- confirm with caller)
 *
 * For each block, with n = table[scan8[i]]:
 *       n != 0                               -> ff_h264_idct_add_neon
 *       n == 0 and first coefficient != 0    -> ff_h264_idct_dc_add_neon
 *       n == 0 and first coefficient == 0    -> block is skipped
 *
 * Relies on both callees leaving r1, r2 and r12 unchanged.
 */
function ff_h264_idct_add16intra_neon, export=1
        push            {r4-r8,lr}
        ldr             r6,  [sp, #24]          /* first stack argument (six registers were pushed) */
        mov             r5,  r1                 /* r5 = offset table cursor */
        mov             r1,  r2                 /* r1 = coefficient cursor, as the callees expect */
        mov             r2,  r3                 /* r2 = stride, as the callees expect */
        mov             r4,  r0                 /* r4 = base address */
        movrel          r7,  scan8
        mov             r12, #16                /* blocks left */
1:      ldr             r0,  [r5], #4           /* r0 = offset of this block */
        ldrb            r8,  [r7], #1           /* r8 = scan8[i] */
        ldrb            r8,  [r6, r8]           /* r8 = n */
        add             r0,  r0,  r4
        cmp             r8,  #0
        ldrsh           r8,  [r1]               /* first coefficient (flags are not affected) */
        adreq           lr,  ff_h264_idct_dc_add_neon
        adrne           lr,  ff_h264_idct_add_neon
        cmpeq           r8,  #0                 /* only when n == 0: is the first coefficient zero? */
        blxne           lr                      /* taken unless n == 0 and first coefficient == 0 */
        add             r1,  r1,  #32           /* next coefficient block */
        subs            r12, r12, #1
        bne             1b
        pop             {r4-r8,pc}
        .endfunc
150
/*
 * ff_h264_idct_add8_neon
 *
 * Walks 8 blocks (entries 16..23 of the offset table, the coefficient
 * array and scan8) and, per block, calls ff_h264_idct_add_neon,
 * ff_h264_idct_dc_add_neon or nothing.
 *
 * In:   r0   = address of two base pointers; the first serves blocks
 *              0-3 of this call, the second blocks 4-7
 *       r1   = table of 32-bit offsets; entries 16..23 are used
 *       r2   = coefficients, 32 bytes per block; blocks 16..23 are used
 *       r3   = row stride, handed to the callee in r2
 *       [sp] = byte table indexed through scan8 (presumably the
 *              per-block nonzero-coefficient counts -- confirm with caller)
 *
 * For each block, with n = table[scan8[16 + i]]:
 *       n != 0                               -> ff_h264_idct_add_neon
 *       n == 0 and first coefficient != 0    -> ff_h264_idct_dc_add_neon
 *       n == 0 and first coefficient == 0    -> block is skipped
 *
 * The loop counter runs 7..0 so that bit 2 of it splits the eight
 * blocks cleanly into two groups of four.  (Counting 8..1 and testing
 * bit 2 selects the pointers in the order 1st,2nd,2nd,2nd,2nd,1st,1st,1st,
 * which sends six of the eight blocks to the wrong plane.)
 *
 * Relies on both callees leaving r1, r2 and ip unchanged.
 */
function ff_h264_idct_add8_neon, export=1
        push            {r4-r10,lr}
        ldm             r0,  {r4,r9}            /* r4 = first base pointer, r9 = second */
        add             r5,  r1,  #16*4         /* skip the first 16 offsets */
        add             r1,  r2,  #16*32        /* skip the first 16 coefficient blocks */
        mov             r2,  r3                 /* r2 = stride, as the callees expect */
        ldr             r6,  [sp, #32]          /* first stack argument (eight registers were pushed) */
        movrel          r7,  scan8+16
        mov             ip,  #7                 /* 7..4: blocks 0-3, 3..0: blocks 4-7 */
1:      ldrb            r8,  [r7], #1           /* r8 = scan8[16 + i] */
        ldr             r0,  [r5], #4           /* r0 = offset of this block */
        ldrb            r8,  [r6, r8]           /* r8 = n */
        tst             ip,  #4
        addne           r0,  r0,  r4            /* blocks 0-3: first base pointer */
        addeq           r0,  r0,  r9            /* blocks 4-7: second base pointer */
        cmp             r8,  #0
        ldrsh           r8,  [r1]               /* first coefficient (flags are not affected) */
        adrne           lr,  ff_h264_idct_add_neon
        adreq           lr,  ff_h264_idct_dc_add_neon
        cmpeq           r8,  #0                 /* only when n == 0: is the first coefficient zero? */
        blxne           lr                      /* taken unless n == 0 and first coefficient == 0 */
        subs            ip,  ip,  #1
        add             r1,  r1,  #32           /* next coefficient block (flags are not affected) */
        bge             1b                      /* eight passes: exit once ip drops below zero */
        pop             {r4-r10,pc}
        .endfunc
177
/*
 * scan8: 24 one-byte indices, each written as x + y*8, i.e. a position
 * in a table laid out in rows of eight bytes.
 *
 * Entries 0..15 are read by ff_h264_idct_add16_neon and
 * ff_h264_idct_add16intra_neon; entries 16..23 are read by
 * ff_h264_idct_add8_neon through scan8+16.
 */
        .section        .rodata
scan8:
        /* entries 0..15: columns 4-7, rows 1-4, in 2x2 groups */
        .byte           4+1*8, 5+1*8, 4+2*8, 5+2*8
        .byte           6+1*8, 7+1*8, 6+2*8, 7+2*8
        .byte           4+3*8, 5+3*8, 4+4*8, 5+4*8
        .byte           6+3*8, 7+3*8, 6+4*8, 7+4*8
        /* entries 16..19: columns 1-2, rows 1-2 */
        .byte           1+1*8, 2+1*8, 1+2*8, 2+2*8
        /* entries 20..23: columns 1-2, rows 4-5 */
        .byte           1+4*8, 2+4*8, 1+5*8, 2+5*8