7f1c8eb8d0ff526ea312810ff943dc86bb2648b3
[libav.git] / libavcodec / arm / h264idct_neon.S
1 /*
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "asm.S"
22
23 preserve8
24 .fpu neon
25
26 .text
27
/*
 * ff_h264_idct_add_neon
 *
 * Transforms one 4x4 block of 16-bit coefficients and adds the result to
 * a 4x4 block of 8-bit pixels in place.
 *
 * Inputs, as used by the code below:
 *   r0  address of the first of four 4-byte pixel rows (every access is
 *       marked 32-bit aligned)
 *   r1  address of sixteen 16-bit coefficients, fetched with a single
 *       32-byte load marked 128-bit aligned; this memory is only read
 *   r2  byte distance between successive pixel rows
 *
 * Overwrites r0, r3, q0-q3, q8 and q9.
 *
 * NOTE(review): the name suggests the H.264 4x4 inverse transform; the C
 * prototype is not visible in this file - confirm against the header.
 */
28 function ff_h264_idct_add_neon, export=1
/*
 * d16 = { 32, 0, 0, 0 }. It is added to coefficient 0 only; that
 * coefficient reaches every output sample with weight 1 in both passes,
 * so this acts as the rounding term for the final ">> 6".
 */
29 mov r3, #(1<<5)
30 vmov.i16 d16, #0
31 vmov.16 d16[0], r3
/* d0..d3 = the sixteen coefficients, four per register (c0..c3 below) */
32 vld1.64 {d0-d3}, [r1,:128]
33 vadd.i16 d0, d0, d16
34
/*
 * Pass 1, combining the four registers lane by lane.
 * After the swap d1 = c2 and d2 = c1, so q1 = { c1, c3 } can be halved
 * with one instruction (q8 = q1 >> 1; the bias in d16 is no longer needed).
 *   d4 = c0 + c2            d5 = c0 - c2
 *   d6 = c1 + (c3 >> 1)     d7 = (c1 >> 1) - c3
 *   q0 = { d4 + d6, d5 + d7 }
 *   q1 = { d4 - d6, d5 - d7 }
 */
35 vswp d1, d2
36 vadd.i16 d4, d0, d1
37 vshr.s16 q8, q1, #1
38 vsub.i16 d5, d0, d1
39 vadd.i16 d6, d2, d17
40 vsub.i16 d7, d16, d3
41 vadd.i16 q0, q2, q3
42 vsub.i16 q1, q2, q3
43
/*
 * 4x4 transpose of the 16-bit elements. The pass 1 results are held in
 * the register order d0, d1, d3, d2, and the transposed data stays in
 * that same order (t0..t3 below).
 */
44 vtrn.16 d0, d1
45 vtrn.16 d3, d2
46 vtrn.32 d0, d3
47 vtrn.32 d1, d2
48
/*
 * Pass 2 on t0..t3 = d0, d1, d3, d2, interleaved with the four pixel-row
 * loads. d4 is formed before the swap; afterwards d1 = t2 and d3 = t1, so
 * q1 = { t3, t1 } is halved into q8.
 *   d4 = t0 + t2            d5 = t0 - t2
 *   d6 = (t3 >> 1) + t1     d7 = t3 - (t1 >> 1)
 *   q0 = { d4 + d6, d5 + d7 }
 *   q1 = { d4 - d6, d5 - d7 }
 * The pixel rows are loaded in memory order 0, 1, 2, 3 into the lanes
 * d18[0], d19[1], d18[1], d19[0], i.e. d18 = { row 0, row 2 } and
 * d19 = { row 3, row 1 }; q0 is later added to d18 and q1 to d19.
 */
49 vadd.i16 d4, d0, d3
50 vld1.32 {d18[0]}, [r0,:32], r2
51 vswp d1, d3
52 vshr.s16 q8, q1, #1
53 vld1.32 {d19[1]}, [r0,:32], r2
54 vsub.i16 d5, d0, d1
55 vld1.32 {d18[1]}, [r0,:32], r2
56 vadd.i16 d6, d16, d3
57 vld1.32 {d19[0]}, [r0,:32], r2
58 vsub.i16 d7, d2, d17
/* the four loads advanced r0 by 4 * r2; step back to the first row */
59 sub r0, r0, r2, lsl #2
60 vadd.i16 q0, q2, q3
61 vsub.i16 q1, q2, q3
62
/* arithmetic shift right by 6 (the +32 is already contained in the data) */
63 vshr.s16 q0, q0, #6
64 vshr.s16 q1, q1, #6
65
/* widen the 8-bit pixels to 16 bit and add them to the residual */
66 vaddw.u8 q0, q0, d18
67 vaddw.u8 q1, q1, d19
68
/* narrow back to 8 bit with unsigned saturation */
69 vqmovun.s16 d0, q0
70 vqmovun.s16 d1, q1
71
/* write the rows back using the same lane order as the loads */
72 vst1.32 {d0[0]}, [r0,:32], r2
73 vst1.32 {d1[1]}, [r0,:32], r2
74 vst1.32 {d0[1]}, [r0,:32], r2
75 vst1.32 {d1[0]}, [r0,:32], r2
76
77 bx lr
78 .endfunc
79
/*
 * ff_h264_idct_dc_add_neon
 *
 * Adds one rounded 16-bit value to every pixel of a 4x4 block.
 *
 * In:   r0  address of the first of four 4-byte pixel rows (accesses are
 *           marked 32-bit aligned)
 *       r1  address of the 16-bit value (marked 16-bit aligned); only the
 *           first halfword is read
 *       r2  byte distance between successive pixel rows
 * Overwrites r0 and q0-q2.
 */
function ff_h264_idct_dc_add_neon, export=1
        vld1.16         {d2[], d3[]}, [r1,:16]      /* q1 = value in all 8 lanes */
        vld1.32         {d0[0]},  [r0,:32], r2      /* pixel row 0 */
        vrshr.s16       q1,  q1,  #6                /* rounding shift right by 6 */
        vld1.32         {d0[1]},  [r0,:32], r2      /* pixel row 1 */
        vld1.32         {d1[0]},  [r0,:32], r2      /* pixel row 2 */
        vaddw.u8        q2,  q1,  d0                /* rows 0-1, widened, plus value */
        vld1.32         {d1[1]},  [r0,:32], r2      /* pixel row 3 */
        sub             r0,  r0,  r2,  lsl #2       /* back to row 0 for the stores */
        vaddw.u8        q1,  q1,  d1                /* rows 2-3, widened, plus value */

        vqmovun.s16     d0,  q2                     /* narrow with unsigned saturation */
        vqmovun.s16     d1,  q1

        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
.endfunc
98
/*
 * ff_h264_idct_add16_neon
 *
 * Loops over the first 16 entries of scan8 and, for each 4x4 block, calls
 * one of the two block routines in this file or skips the block.
 *
 * In:   r0    base address; each block is written at r0 + offset[i]
 *       r1    table of 32-bit offsets, 16 entries read
 *       r2    16-bit coefficients, 32 bytes per block
 *       r3    handed to the block routines in r2 (their row distance)
 *       [sp]  fifth argument: byte table indexed by the scan8 values
 *
 * With n = table[scan8[i]]:
 *       n == 0                              block is skipped
 *       n == 1 and first coefficient != 0   ff_h264_idct_dc_add_neon
 *       anything else                       ff_h264_idct_add_neon
 *
 * Live across the calls: r1 (coefficients), r2, r4 (base), r5 (offset
 * cursor), r6 (byte table), r7 (scan8 cursor), r12 (blocks left). The
 * block routines in this file leave all of these untouched.
 *
 * NOTE(review): C-side parameter names are not visible in this file.
 */
function ff_h264_idct_add16_neon, export=1
        push            {r4-r8,lr}
        ldr             r6,  [sp, #24]              /* 5th argument sits above the 24 pushed bytes */
        movw            r7,  #:lower16:scan8
        movt            r7,  #:upper16:scan8
        mov             r4,  r0
        mov             r5,  r1                     /* must precede the overwrite of r1 */
        mov             r1,  r2
        mov             r2,  r3
        mov             r12, #16
1:      ldrb            r8,  [r7], #1               /* next scan8 value */
        ldr             r0,  [r5], #4               /* next offset */
        ldrb            r8,  [r6, r8]               /* n = table[scan8 value] */
        add             r0,  r0,  r4                /* destination of this block */
        cmp             r8,  #1                     /* Z set only when n == 1 */
        blo             2f                          /* n == 0: nothing to add */
        ldrsh           lr,  [r1]                   /* first coefficient */
        movne           lr,  #0                     /* n > 1: force the full transform */
        cmp             lr,  #0
        adreq           lr,  ff_h264_idct_add_neon
        adrne           lr,  ff_h264_idct_dc_add_neon
        blx             lr
2:      add             r1,  r1,  #32               /* next block of coefficients */
        subs            r12, r12, #1
        bne             1b
        pop             {r4-r8,pc}
.endfunc
126
/*
 * ff_h264_idct_add16intra_neon
 *
 * Same walk over the first 16 scan8 entries as ff_h264_idct_add16_neon,
 * with a different selection rule.
 *
 * In:   r0    base address; each block is written at r0 + offset[i]
 *       r1    table of 32-bit offsets, 16 entries read
 *       r2    16-bit coefficients, 32 bytes per block
 *       r3    handed to the block routines in r2 (their row distance)
 *       [sp]  fifth argument: byte table indexed by the scan8 values
 *
 * With n = table[scan8[i]]:
 *       n != 0                              ff_h264_idct_add_neon
 *       n == 0 and first coefficient != 0   ff_h264_idct_dc_add_neon
 *       n == 0 and first coefficient == 0   block is skipped
 *
 * NOTE(review): C-side parameter names are not visible in this file.
 */
function ff_h264_idct_add16intra_neon, export=1
        push            {r4-r8,lr}
        ldr             r6,  [sp, #24]              /* 5th argument sits above the 24 pushed bytes */
        movw            r7,  #:lower16:scan8
        movt            r7,  #:upper16:scan8
        mov             r4,  r0
        mov             r5,  r1                     /* must precede the overwrite of r1 */
        mov             r1,  r2
        mov             r2,  r3
        mov             r12, #16
1:      ldrb            r8,  [r7], #1               /* next scan8 value */
        ldr             r0,  [r5], #4               /* next offset */
        ldrb            r8,  [r6, r8]               /* n = table[scan8 value] */
        add             r0,  r0,  r4                /* destination of this block */
        cmp             r8,  #0
        ldrsh           r8,  [r1]                   /* first coefficient; flags untouched */
        adreq           lr,  ff_h264_idct_dc_add_neon
        adrne           lr,  ff_h264_idct_add_neon
        cmpeq           r8,  #0                     /* n == 0: call only if coefficient != 0 */
        blxne           lr
        add             r1,  r1,  #32               /* next block of coefficients */
        subs            r12, r12, #1
        bne             1b
        pop             {r4-r8,pc}
.endfunc
152
/*
 * ff_h264_idct_add8_neon
 *
 * Walks scan8[16..23] (eight 4x4 blocks) and adds every non-empty block
 * to one of two destinations: the first four blocks use the first
 * pointer, the last four use the second.
 *
 * In:   r0    address of two consecutive pointers (loaded into r4, r9)
 *       r1    table of 32-bit offsets; entries 16..23 are read
 *       r2    16-bit coefficients, 32 bytes per block; block 16 is the
 *             first one processed
 *       r3    handed to the block routines in r2 (their row distance)
 *       [sp]  fifth argument: byte table indexed by the scan8 values
 *
 * With n = table[scan8[i]]:
 *       n != 0                              ff_h264_idct_add_neon
 *       n == 0 and first coefficient != 0   ff_h264_idct_dc_add_neon
 *       n == 0 and first coefficient == 0   block is skipped
 *
 * NOTE(review): two pointers and two groups of four in scan8 suggest the
 * two chroma planes; the C prototype is not visible here - confirm.
 */
function ff_h264_idct_add8_neon, export=1
        push            {r4-r10,lr}
        ldm             r0,  {r4,r9}                /* r4 = first pointer, r9 = second */
        add             r5,  r1,  #16*4             /* &offset[16] */
        add             r1,  r2,  #16*32            /* coefficients of block 16 */
        mov             r2,  r3
        ldr             r6,  [sp, #32]              /* 5th argument sits above the 32 pushed bytes */
        movw            r7,  #:lower16:scan8+16
        movt            r7,  #:upper16:scan8+16
        /*
         * ip runs 7, 6, ..., 0. Bit 2 of ip is set for the first four
         * blocks (7..4) and clear for the last four (3..0), which gives
         * the required 4/4 split between the two pointers.
         */
        mov             ip,  #7
1:      ldrb            r8,  [r7], #1               /* next scan8 value */
        ldr             r0,  [r5], #4               /* next offset */
        ldrb            r8,  [r6, r8]               /* n = table[scan8 value] */
        tst             ip,  #4
        addne           r0,  r0,  r4                /* blocks 0-3 of this loop */
        addeq           r0,  r0,  r9                /* blocks 4-7 of this loop */
        cmp             r8,  #0
        ldrsh           r8,  [r1]                   /* first coefficient; flags untouched */
        adrne           lr,  ff_h264_idct_add_neon
        adreq           lr,  ff_h264_idct_dc_add_neon
        cmpeq           r8,  #0                     /* n == 0: call only if coefficient != 0 */
        blxne           lr
        subs            ip,  ip,  #1
        add             r1,  r1,  #32               /* next block of coefficients */
        bge             1b                          /* leave once ip has gone below 0 */
        pop             {r4-r10,pc}
.endfunc
180
181 .section .rodata
182 scan8: .byte 4+1*8, 5+1*8, 4+2*8, 5+2*8
183 .byte 6+1*8, 7+1*8, 6+2*8, 7+2*8
184 .byte 4+3*8, 5+3*8, 4+4*8, 5+4*8
185 .byte 6+3*8, 7+3*8, 6+4*8, 7+4*8
186 .byte 1+1*8, 2+1*8
187 .byte 1+2*8, 2+2*8
188 .byte 1+4*8, 2+4*8
189 .byte 1+5*8, 2+5*8