Better ARM support for mplayer/ffmpeg, ported from atty fork
[libav.git] / libavcodec / armv4l / dsputil_arm.c
1 /*
2 * ARMv4L optimized DSP utils
3 * Copyright (c) 2001 Lionel Ulmer.
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20 #include "../dsputil.h"
21 #ifdef HAVE_IPP
22 #include "ipp.h"
23 #endif
24
25 #ifdef HAVE_IWMMXT
26 extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);
27 #endif
28
29 extern void j_rev_dct_ARM(DCTELEM *data);
30 extern void simple_idct_ARM(DCTELEM *data);
31
32 /* XXX: local hack */
33 static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
34 static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
35
36 void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
37 void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
38 void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
39 void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
40
41 void put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
42 void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
43 void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
44
45 void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
46 static void put_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
47 {
48 put_pixels8_x2_arm(block, pixels, line_size, h);
49 put_pixels8_x2_arm(block + 8, pixels + 8, line_size, h);
50 }
51
52 static void put_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
53 {
54 put_pixels8_y2_arm(block, pixels, line_size, h);
55 put_pixels8_y2_arm(block + 8, pixels + 8, line_size, h);
56 }
57
58 static void put_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
59 {
60 put_pixels8_xy2_arm(block, pixels, line_size, h);
61 put_pixels8_xy2_arm(block + 8, pixels + 8, line_size, h);
62 }
63
64 static void put_no_rnd_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
65 {
66 put_no_rnd_pixels8_x2_arm(block, pixels, line_size, h);
67 put_no_rnd_pixels8_x2_arm(block + 8, pixels + 8, line_size, h);
68 }
69
70 static void put_no_rnd_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
71 {
72 put_no_rnd_pixels8_y2_arm(block, pixels, line_size, h);
73 put_no_rnd_pixels8_y2_arm(block + 8, pixels + 8, line_size, h);
74 }
75
76 static void put_no_rnd_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
77 {
78 put_no_rnd_pixels8_xy2_arm(block, pixels, line_size, h);
79 put_no_rnd_pixels8_xy2_arm(block + 8, pixels + 8, line_size, h);
80 }
81
82 static void add_pixels_clamped_ARM(short *block, unsigned char *dest, int line_size)
83 {
84 asm volatile (
85 "mov r10, #8 \n\t"
86
87 "1: \n\t"
88
89 /* load dest */
90 "ldr r4, [%1] \n\t"
91 /* block[0] and block[1]*/
92 "ldrsh r5, [%0] \n\t"
93 "ldrsh r7, [%0, #2] \n\t"
94 "and r6, r4, #0xFF \n\t"
95 "and r8, r4, #0xFF00 \n\t"
96 "add r6, r5, r6 \n\t"
97 "add r8, r7, r8, lsr #8 \n\t"
98 "mvn r5, r5 \n\t"
99 "mvn r7, r7 \n\t"
100 "tst r6, #0x100 \n\t"
101 "movne r6, r5, lsr #24 \n\t"
102 "tst r8, #0x100 \n\t"
103 "movne r8, r7, lsr #24 \n\t"
104 "mov r9, r6 \n\t"
105 "ldrsh r5, [%0, #4] \n\t" /* moved form [A] */
106 "orr r9, r9, r8, lsl #8 \n\t"
107 /* block[2] and block[3] */
108 /* [A] */
109 "ldrsh r7, [%0, #6] \n\t"
110 "and r6, r4, #0xFF0000 \n\t"
111 "and r8, r4, #0xFF000000 \n\t"
112 "add r6, r5, r6, lsr #16 \n\t"
113 "add r8, r7, r8, lsr #24 \n\t"
114 "mvn r5, r5 \n\t"
115 "mvn r7, r7 \n\t"
116 "tst r6, #0x100 \n\t"
117 "movne r6, r5, lsr #24 \n\t"
118 "tst r8, #0x100 \n\t"
119 "movne r8, r7, lsr #24 \n\t"
120 "orr r9, r9, r6, lsl #16 \n\t"
121 "ldr r4, [%1, #4] \n\t" /* moved form [B] */
122 "orr r9, r9, r8, lsl #24 \n\t"
123 /* store dest */
124 "ldrsh r5, [%0, #8] \n\t" /* moved form [C] */
125 "str r9, [%1] \n\t"
126
127 /* load dest */
128 /* [B] */
129 /* block[4] and block[5] */
130 /* [C] */
131 "ldrsh r7, [%0, #10] \n\t"
132 "and r6, r4, #0xFF \n\t"
133 "and r8, r4, #0xFF00 \n\t"
134 "add r6, r5, r6 \n\t"
135 "add r8, r7, r8, lsr #8 \n\t"
136 "mvn r5, r5 \n\t"
137 "mvn r7, r7 \n\t"
138 "tst r6, #0x100 \n\t"
139 "movne r6, r5, lsr #24 \n\t"
140 "tst r8, #0x100 \n\t"
141 "movne r8, r7, lsr #24 \n\t"
142 "mov r9, r6 \n\t"
143 "ldrsh r5, [%0, #12] \n\t" /* moved from [D] */
144 "orr r9, r9, r8, lsl #8 \n\t"
145 /* block[6] and block[7] */
146 /* [D] */
147 "ldrsh r7, [%0, #14] \n\t"
148 "and r6, r4, #0xFF0000 \n\t"
149 "and r8, r4, #0xFF000000 \n\t"
150 "add r6, r5, r6, lsr #16 \n\t"
151 "add r8, r7, r8, lsr #24 \n\t"
152 "mvn r5, r5 \n\t"
153 "mvn r7, r7 \n\t"
154 "tst r6, #0x100 \n\t"
155 "movne r6, r5, lsr #24 \n\t"
156 "tst r8, #0x100 \n\t"
157 "movne r8, r7, lsr #24 \n\t"
158 "orr r9, r9, r6, lsl #16 \n\t"
159 "add %0, %0, #16 \n\t" /* moved from [E] */
160 "orr r9, r9, r8, lsl #24 \n\t"
161 "subs r10, r10, #1 \n\t" /* moved from [F] */
162 /* store dest */
163 "str r9, [%1, #4] \n\t"
164
165 /* [E] */
166 /* [F] */
167 "add %1, %1, %2 \n\t"
168 "bne 1b \n\t"
169 :
170 : "r"(block),
171 "r"(dest),
172 "r"(line_size)
173 : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc", "memory" );
174 }
175
176 /* XXX: those functions should be suppressed ASAP when all IDCTs are
177 converted */
178 static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block)
179 {
180 j_rev_dct_ARM (block);
181 ff_put_pixels_clamped(block, dest, line_size);
182 }
183 static void j_rev_dct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block)
184 {
185 j_rev_dct_ARM (block);
186 ff_add_pixels_clamped(block, dest, line_size);
187 }
188 static void simple_idct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block)
189 {
190 simple_idct_ARM (block);
191 ff_put_pixels_clamped(block, dest, line_size);
192 }
193 static void simple_idct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block)
194 {
195 simple_idct_ARM (block);
196 ff_add_pixels_clamped(block, dest, line_size);
197 }
198 static void simple_idct_ipp(DCTELEM *block)
199 {
200 #ifdef HAVE_IPP
201 ippiDCT8x8Inv_Video_16s_C1I(block);
202 #endif
203 }
204 static void simple_idct_ipp_put(uint8_t *dest, int line_size, DCTELEM *block)
205 {
206 #ifdef HAVE_IPP
207 ippiDCT8x8Inv_Video_16s8u_C1R(block, dest, line_size);
208 #endif
209 }
210
211 #ifdef HAVE_IWMMXT
212 void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size);
213 #endif
214
215 static void simple_idct_ipp_add(uint8_t *dest, int line_size, DCTELEM *block)
216 {
217 #ifdef HAVE_IPP
218 ippiDCT8x8Inv_Video_16s_C1I(block);
219 #ifdef HAVE_IWMMXT
220 add_pixels_clamped_iwmmxt(block, dest, line_size);
221 #else
222 add_pixels_clamped_ARM(block, dest, line_size);
223 #endif
224 #endif
225 }
226
227 void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
228 {
229 const int idct_algo= avctx->idct_algo;
230
231 ff_put_pixels_clamped = c->put_pixels_clamped;
232 ff_add_pixels_clamped = c->add_pixels_clamped;
233
234 #ifdef HAVE_IPP
235 if(idct_algo==FF_IDCT_ARM){
236 #else
237 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_ARM){
238 #endif
239 c->idct_put= j_rev_dct_ARM_put;
240 c->idct_add= j_rev_dct_ARM_add;
241 c->idct = j_rev_dct_ARM;
242 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;/* FF_NO_IDCT_PERM */
243 } else if (idct_algo==FF_IDCT_SIMPLEARM){
244 c->idct_put= simple_idct_ARM_put;
245 c->idct_add= simple_idct_ARM_add;
246 c->idct = simple_idct_ARM;
247 c->idct_permutation_type= FF_NO_IDCT_PERM;
248 #ifdef HAVE_IPP
249 } else if (idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_IPP){
250 #else
251 } else if (idct_algo==FF_IDCT_IPP){
252 #endif
253 c->idct_put= simple_idct_ipp_put;
254 c->idct_add= simple_idct_ipp_add;
255 c->idct = simple_idct_ipp;
256 c->idct_permutation_type= FF_NO_IDCT_PERM;
257 }
258
259 /* c->put_pixels_tab[0][0] = put_pixels16_arm; */ // NG!
260 c->put_pixels_tab[0][1] = put_pixels16_x2_arm; //OK!
261 c->put_pixels_tab[0][2] = put_pixels16_y2_arm; //OK!
262 /* c->put_pixels_tab[0][3] = put_pixels16_xy2_arm; /\* NG *\/ */
263 /* c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm; // ?(»È¤ï¤ì¤Ê¤¤) */
264 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm; // OK
265 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm; //OK
266 /* c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm; //NG */
267 c->put_pixels_tab[1][0] = put_pixels8_arm; //OK
268 c->put_pixels_tab[1][1] = put_pixels8_x2_arm; //OK
269 /* c->put_pixels_tab[1][2] = put_pixels8_y2_arm; //NG */
270 /* c->put_pixels_tab[1][3] = put_pixels8_xy2_arm; //NG */
271 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm;//OK
272 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm; //OK
273 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; //OK
274 /* c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm;//NG */
275
276 #if 1
277 #ifdef HAVE_IWMMXT
278 dsputil_init_iwmmxt(c, avctx);
279 #endif
280 #endif
281 }