/*
 * ARM: set size of asm functions in object files
 * [libav.git] / libavcodec / arm / simple_idct_armv5te.S
 */
/*
 * Simple IDCT
 *
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
23
24 #include "asm.S"
25
26 #define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
27 #define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
28 #define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
29 #define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
30 #define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
31 #define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
32 #define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
33 #define ROW_SHIFT 11
34 #define COL_SHIFT 20
35
36 #define W13 (W1 | (W3 << 16))
37 #define W26 (W2 | (W6 << 16))
38 #define W57 (W5 | (W7 << 16))
39
40 .text
41 .align
42 w13: .long W13
43 w26: .long W26
44 w57: .long W57
45
46 function idct_row_armv5te
47 str lr, [sp, #-4]!
48
49 ldrd v1, [a1, #8]
50 ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */
51 orrs v1, v1, v2
52 cmpeq v1, a4
53 cmpeq v1, a3, lsr #16
54 beq row_dc_only
55
56 mov v1, #(1<<(ROW_SHIFT-1))
57 mov ip, #16384
58 sub ip, ip, #1 /* ip = W4 */
59 smlabb v1, ip, a3, v1 /* v1 = W4*row[0]+(1<<(RS-1)) */
60 ldr ip, w26 /* ip = W2 | (W6 << 16) */
61 smultb a2, ip, a4
62 smulbb lr, ip, a4
63 add v2, v1, a2
64 sub v3, v1, a2
65 sub v4, v1, lr
66 add v1, v1, lr
67
68 ldr ip, w13 /* ip = W1 | (W3 << 16) */
69 ldr lr, w57 /* lr = W5 | (W7 << 16) */
70 smulbt v5, ip, a3
71 smultt v6, lr, a4
72 smlatt v5, ip, a4, v5
73 smultt a2, ip, a3
74 smulbt v7, lr, a3
75 sub v6, v6, a2
76 smulbt a2, ip, a4
77 smultt fp, lr, a3
78 sub v7, v7, a2
79 smulbt a2, lr, a4
80 ldrd a3, [a1, #8] /* a3=row[5:4] a4=row[7:6] */
81 sub fp, fp, a2
82
83 orrs a2, a3, a4
84 beq 1f
85
86 smlabt v5, lr, a3, v5
87 smlabt v6, ip, a3, v6
88 smlatt v5, lr, a4, v5
89 smlabt v6, lr, a4, v6
90 smlatt v7, lr, a3, v7
91 smlatt fp, ip, a3, fp
92 smulbt a2, ip, a4
93 smlatt v7, ip, a4, v7
94 sub fp, fp, a2
95
96 ldr ip, w26 /* ip = W2 | (W6 << 16) */
97 mov a2, #16384
98 sub a2, a2, #1 /* a2 = W4 */
99 smulbb a2, a2, a3 /* a2 = W4*row[4] */
100 smultb lr, ip, a4 /* lr = W6*row[6] */
101 add v1, v1, a2 /* v1 += W4*row[4] */
102 add v1, v1, lr /* v1 += W6*row[6] */
103 add v4, v4, a2 /* v4 += W4*row[4] */
104 sub v4, v4, lr /* v4 -= W6*row[6] */
105 smulbb lr, ip, a4 /* lr = W2*row[6] */
106 sub v2, v2, a2 /* v2 -= W4*row[4] */
107 sub v2, v2, lr /* v2 -= W2*row[6] */
108 sub v3, v3, a2 /* v3 -= W4*row[4] */
109 add v3, v3, lr /* v3 += W2*row[6] */
110
111 1: add a2, v1, v5
112 mov a3, a2, lsr #11
113 bic a3, a3, #0x1f0000
114 sub a2, v2, v6
115 mov a2, a2, lsr #11
116 add a3, a3, a2, lsl #16
117 add a2, v3, v7
118 mov a4, a2, lsr #11
119 bic a4, a4, #0x1f0000
120 add a2, v4, fp
121 mov a2, a2, lsr #11
122 add a4, a4, a2, lsl #16
123 strd a3, [a1]
124
125 sub a2, v4, fp
126 mov a3, a2, lsr #11
127 bic a3, a3, #0x1f0000
128 sub a2, v3, v7
129 mov a2, a2, lsr #11
130 add a3, a3, a2, lsl #16
131 add a2, v2, v6
132 mov a4, a2, lsr #11
133 bic a4, a4, #0x1f0000
134 sub a2, v1, v5
135 mov a2, a2, lsr #11
136 add a4, a4, a2, lsl #16
137 strd a3, [a1, #8]
138
139 ldr pc, [sp], #4
140
141 row_dc_only:
142 orr a3, a3, a3, lsl #16
143 bic a3, a3, #0xe000
144 mov a3, a3, lsl #3
145 mov a4, a3
146 strd a3, [a1]
147 strd a3, [a1, #8]
148
149 ldr pc, [sp], #4
150 endfunc
151
152 .macro idct_col
153 ldr a4, [a1] /* a4 = col[1:0] */
154 mov ip, #16384
155 sub ip, ip, #1 /* ip = W4 */
156 #if 0
157 mov v1, #(1<<(COL_SHIFT-1))
158 smlabt v2, ip, a4, v1 /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */
159 smlabb v1, ip, a4, v1 /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */
160 ldr a4, [a1, #(16*4)]
161 #else
162 mov v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */
163 add v2, v1, a4, asr #16
164 rsb v2, v2, v2, lsl #14
165 mov a4, a4, lsl #16
166 add v1, v1, a4, asr #16
167 ldr a4, [a1, #(16*4)]
168 rsb v1, v1, v1, lsl #14
169 #endif
170
171 smulbb lr, ip, a4
172 smulbt a3, ip, a4
173 sub v3, v1, lr
174 sub v5, v1, lr
175 add v7, v1, lr
176 add v1, v1, lr
177 sub v4, v2, a3
178 sub v6, v2, a3
179 add fp, v2, a3
180 ldr ip, w26
181 ldr a4, [a1, #(16*2)]
182 add v2, v2, a3
183
184 smulbb lr, ip, a4
185 smultb a3, ip, a4
186 add v1, v1, lr
187 sub v7, v7, lr
188 add v3, v3, a3
189 sub v5, v5, a3
190 smulbt lr, ip, a4
191 smultt a3, ip, a4
192 add v2, v2, lr
193 sub fp, fp, lr
194 add v4, v4, a3
195 ldr a4, [a1, #(16*6)]
196 sub v6, v6, a3
197
198 smultb lr, ip, a4
199 smulbb a3, ip, a4
200 add v1, v1, lr
201 sub v7, v7, lr
202 sub v3, v3, a3
203 add v5, v5, a3
204 smultt lr, ip, a4
205 smulbt a3, ip, a4
206 add v2, v2, lr
207 sub fp, fp, lr
208 sub v4, v4, a3
209 add v6, v6, a3
210
211 stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp}
212
213 ldr ip, w13
214 ldr a4, [a1, #(16*1)]
215 ldr lr, w57
216 smulbb v1, ip, a4
217 smultb v3, ip, a4
218 smulbb v5, lr, a4
219 smultb v7, lr, a4
220 smulbt v2, ip, a4
221 smultt v4, ip, a4
222 smulbt v6, lr, a4
223 smultt fp, lr, a4
224 rsb v4, v4, #0
225 ldr a4, [a1, #(16*3)]
226 rsb v3, v3, #0
227
228 smlatb v1, ip, a4, v1
229 smlatb v3, lr, a4, v3
230 smulbb a3, ip, a4
231 smulbb a2, lr, a4
232 sub v5, v5, a3
233 sub v7, v7, a2
234 smlatt v2, ip, a4, v2
235 smlatt v4, lr, a4, v4
236 smulbt a3, ip, a4
237 smulbt a2, lr, a4
238 sub v6, v6, a3
239 ldr a4, [a1, #(16*5)]
240 sub fp, fp, a2
241
242 smlabb v1, lr, a4, v1
243 smlabb v3, ip, a4, v3
244 smlatb v5, lr, a4, v5
245 smlatb v7, ip, a4, v7
246 smlabt v2, lr, a4, v2
247 smlabt v4, ip, a4, v4
248 smlatt v6, lr, a4, v6
249 ldr a3, [a1, #(16*7)]
250 smlatt fp, ip, a4, fp
251
252 smlatb v1, lr, a3, v1
253 smlabb v3, lr, a3, v3
254 smlatb v5, ip, a3, v5
255 smulbb a4, ip, a3
256 smlatt v2, lr, a3, v2
257 sub v7, v7, a4
258 smlabt v4, lr, a3, v4
259 smulbt a4, ip, a3
260 smlatt v6, ip, a3, v6
261 sub fp, fp, a4
262 .endm
263
264 function idct_col_armv5te
265 str lr, [sp, #-4]!
266
267 idct_col
268
269 ldmfd sp!, {a3, a4}
270 adds a2, a3, v1
271 mov a2, a2, lsr #20
272 orrmi a2, a2, #0xf000
273 add ip, a4, v2
274 mov ip, ip, asr #20
275 orr a2, a2, ip, lsl #16
276 str a2, [a1]
277 subs a3, a3, v1
278 mov a2, a3, lsr #20
279 orrmi a2, a2, #0xf000
280 sub a4, a4, v2
281 mov a4, a4, asr #20
282 orr a2, a2, a4, lsl #16
283 ldmfd sp!, {a3, a4}
284 str a2, [a1, #(16*7)]
285
286 subs a2, a3, v3
287 mov a2, a2, lsr #20
288 orrmi a2, a2, #0xf000
289 sub ip, a4, v4
290 mov ip, ip, asr #20
291 orr a2, a2, ip, lsl #16
292 str a2, [a1, #(16*1)]
293 adds a3, a3, v3
294 mov a2, a3, lsr #20
295 orrmi a2, a2, #0xf000
296 add a4, a4, v4
297 mov a4, a4, asr #20
298 orr a2, a2, a4, lsl #16
299 ldmfd sp!, {a3, a4}
300 str a2, [a1, #(16*6)]
301
302 adds a2, a3, v5
303 mov a2, a2, lsr #20
304 orrmi a2, a2, #0xf000
305 add ip, a4, v6
306 mov ip, ip, asr #20
307 orr a2, a2, ip, lsl #16
308 str a2, [a1, #(16*2)]
309 subs a3, a3, v5
310 mov a2, a3, lsr #20
311 orrmi a2, a2, #0xf000
312 sub a4, a4, v6
313 mov a4, a4, asr #20
314 orr a2, a2, a4, lsl #16
315 ldmfd sp!, {a3, a4}
316 str a2, [a1, #(16*5)]
317
318 adds a2, a3, v7
319 mov a2, a2, lsr #20
320 orrmi a2, a2, #0xf000
321 add ip, a4, fp
322 mov ip, ip, asr #20
323 orr a2, a2, ip, lsl #16
324 str a2, [a1, #(16*3)]
325 subs a3, a3, v7
326 mov a2, a3, lsr #20
327 orrmi a2, a2, #0xf000
328 sub a4, a4, fp
329 mov a4, a4, asr #20
330 orr a2, a2, a4, lsl #16
331 str a2, [a1, #(16*4)]
332
333 ldr pc, [sp], #4
334 endfunc
335
336 function idct_col_put_armv5te
337 str lr, [sp, #-4]!
338
339 idct_col
340
341 ldmfd sp!, {a3, a4}
342 ldr lr, [sp, #32]
343 add a2, a3, v1
344 movs a2, a2, asr #20
345 movmi a2, #0
346 cmp a2, #255
347 movgt a2, #255
348 add ip, a4, v2
349 movs ip, ip, asr #20
350 movmi ip, #0
351 cmp ip, #255
352 movgt ip, #255
353 orr a2, a2, ip, lsl #8
354 sub a3, a3, v1
355 movs a3, a3, asr #20
356 movmi a3, #0
357 cmp a3, #255
358 movgt a3, #255
359 sub a4, a4, v2
360 movs a4, a4, asr #20
361 movmi a4, #0
362 cmp a4, #255
363 ldr v1, [sp, #28]
364 movgt a4, #255
365 strh a2, [v1]
366 add a2, v1, #2
367 str a2, [sp, #28]
368 orr a2, a3, a4, lsl #8
369 rsb v2, lr, lr, lsl #3
370 ldmfd sp!, {a3, a4}
371 strh a2, [v2, v1]!
372
373 sub a2, a3, v3
374 movs a2, a2, asr #20
375 movmi a2, #0
376 cmp a2, #255
377 movgt a2, #255
378 sub ip, a4, v4
379 movs ip, ip, asr #20
380 movmi ip, #0
381 cmp ip, #255
382 movgt ip, #255
383 orr a2, a2, ip, lsl #8
384 strh a2, [v1, lr]!
385 add a3, a3, v3
386 movs a2, a3, asr #20
387 movmi a2, #0
388 cmp a2, #255
389 movgt a2, #255
390 add a4, a4, v4
391 movs a4, a4, asr #20
392 movmi a4, #0
393 cmp a4, #255
394 movgt a4, #255
395 orr a2, a2, a4, lsl #8
396 ldmfd sp!, {a3, a4}
397 strh a2, [v2, -lr]!
398
399 add a2, a3, v5
400 movs a2, a2, asr #20
401 movmi a2, #0
402 cmp a2, #255
403 movgt a2, #255
404 add ip, a4, v6
405 movs ip, ip, asr #20
406 movmi ip, #0
407 cmp ip, #255
408 movgt ip, #255
409 orr a2, a2, ip, lsl #8
410 strh a2, [v1, lr]!
411 sub a3, a3, v5
412 movs a2, a3, asr #20
413 movmi a2, #0
414 cmp a2, #255
415 movgt a2, #255
416 sub a4, a4, v6
417 movs a4, a4, asr #20
418 movmi a4, #0
419 cmp a4, #255
420 movgt a4, #255
421 orr a2, a2, a4, lsl #8
422 ldmfd sp!, {a3, a4}
423 strh a2, [v2, -lr]!
424
425 add a2, a3, v7
426 movs a2, a2, asr #20
427 movmi a2, #0
428 cmp a2, #255
429 movgt a2, #255
430 add ip, a4, fp
431 movs ip, ip, asr #20
432 movmi ip, #0
433 cmp ip, #255
434 movgt ip, #255
435 orr a2, a2, ip, lsl #8
436 strh a2, [v1, lr]
437 sub a3, a3, v7
438 movs a2, a3, asr #20
439 movmi a2, #0
440 cmp a2, #255
441 movgt a2, #255
442 sub a4, a4, fp
443 movs a4, a4, asr #20
444 movmi a4, #0
445 cmp a4, #255
446 movgt a4, #255
447 orr a2, a2, a4, lsl #8
448 strh a2, [v2, -lr]
449
450 ldr pc, [sp], #4
451 endfunc
452
453 function idct_col_add_armv5te
454 str lr, [sp, #-4]!
455
456 idct_col
457
458 ldr lr, [sp, #36]
459
460 ldmfd sp!, {a3, a4}
461 ldrh ip, [lr]
462 add a2, a3, v1
463 mov a2, a2, asr #20
464 sub a3, a3, v1
465 and v1, ip, #255
466 adds a2, a2, v1
467 movmi a2, #0
468 cmp a2, #255
469 movgt a2, #255
470 add v1, a4, v2
471 mov v1, v1, asr #20
472 adds v1, v1, ip, lsr #8
473 movmi v1, #0
474 cmp v1, #255
475 movgt v1, #255
476 orr a2, a2, v1, lsl #8
477 ldr v1, [sp, #32]
478 sub a4, a4, v2
479 rsb v2, v1, v1, lsl #3
480 ldrh ip, [v2, lr]!
481 strh a2, [lr]
482 mov a3, a3, asr #20
483 and a2, ip, #255
484 adds a3, a3, a2
485 movmi a3, #0
486 cmp a3, #255
487 movgt a3, #255
488 mov a4, a4, asr #20
489 adds a4, a4, ip, lsr #8
490 movmi a4, #0
491 cmp a4, #255
492 movgt a4, #255
493 add a2, lr, #2
494 str a2, [sp, #28]
495 orr a2, a3, a4, lsl #8
496 strh a2, [v2]
497
498 ldmfd sp!, {a3, a4}
499 ldrh ip, [lr, v1]!
500 sub a2, a3, v3
501 mov a2, a2, asr #20
502 add a3, a3, v3
503 and v3, ip, #255
504 adds a2, a2, v3
505 movmi a2, #0
506 cmp a2, #255
507 movgt a2, #255
508 sub v3, a4, v4
509 mov v3, v3, asr #20
510 adds v3, v3, ip, lsr #8
511 movmi v3, #0
512 cmp v3, #255
513 movgt v3, #255
514 orr a2, a2, v3, lsl #8
515 add a4, a4, v4
516 ldrh ip, [v2, -v1]!
517 strh a2, [lr]
518 mov a3, a3, asr #20
519 and a2, ip, #255
520 adds a3, a3, a2
521 movmi a3, #0
522 cmp a3, #255
523 movgt a3, #255
524 mov a4, a4, asr #20
525 adds a4, a4, ip, lsr #8
526 movmi a4, #0
527 cmp a4, #255
528 movgt a4, #255
529 orr a2, a3, a4, lsl #8
530 strh a2, [v2]
531
532 ldmfd sp!, {a3, a4}
533 ldrh ip, [lr, v1]!
534 add a2, a3, v5
535 mov a2, a2, asr #20
536 sub a3, a3, v5
537 and v3, ip, #255
538 adds a2, a2, v3
539 movmi a2, #0
540 cmp a2, #255
541 movgt a2, #255
542 add v3, a4, v6
543 mov v3, v3, asr #20
544 adds v3, v3, ip, lsr #8
545 movmi v3, #0
546 cmp v3, #255
547 movgt v3, #255
548 orr a2, a2, v3, lsl #8
549 sub a4, a4, v6
550 ldrh ip, [v2, -v1]!
551 strh a2, [lr]
552 mov a3, a3, asr #20
553 and a2, ip, #255
554 adds a3, a3, a2
555 movmi a3, #0
556 cmp a3, #255
557 movgt a3, #255
558 mov a4, a4, asr #20
559 adds a4, a4, ip, lsr #8
560 movmi a4, #0
561 cmp a4, #255
562 movgt a4, #255
563 orr a2, a3, a4, lsl #8
564 strh a2, [v2]
565
566 ldmfd sp!, {a3, a4}
567 ldrh ip, [lr, v1]!
568 add a2, a3, v7
569 mov a2, a2, asr #20
570 sub a3, a3, v7
571 and v3, ip, #255
572 adds a2, a2, v3
573 movmi a2, #0
574 cmp a2, #255
575 movgt a2, #255
576 add v3, a4, fp
577 mov v3, v3, asr #20
578 adds v3, v3, ip, lsr #8
579 movmi v3, #0
580 cmp v3, #255
581 movgt v3, #255
582 orr a2, a2, v3, lsl #8
583 sub a4, a4, fp
584 ldrh ip, [v2, -v1]!
585 strh a2, [lr]
586 mov a3, a3, asr #20
587 and a2, ip, #255
588 adds a3, a3, a2
589 movmi a3, #0
590 cmp a3, #255
591 movgt a3, #255
592 mov a4, a4, asr #20
593 adds a4, a4, ip, lsr #8
594 movmi a4, #0
595 cmp a4, #255
596 movgt a4, #255
597 orr a2, a3, a4, lsl #8
598 strh a2, [v2]
599
600 ldr pc, [sp], #4
601 endfunc
602
603 function ff_simple_idct_armv5te, export=1
604 stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}
605
606 bl idct_row_armv5te
607 add a1, a1, #16
608 bl idct_row_armv5te
609 add a1, a1, #16
610 bl idct_row_armv5te
611 add a1, a1, #16
612 bl idct_row_armv5te
613 add a1, a1, #16
614 bl idct_row_armv5te
615 add a1, a1, #16
616 bl idct_row_armv5te
617 add a1, a1, #16
618 bl idct_row_armv5te
619 add a1, a1, #16
620 bl idct_row_armv5te
621
622 sub a1, a1, #(16*7)
623
624 bl idct_col_armv5te
625 add a1, a1, #4
626 bl idct_col_armv5te
627 add a1, a1, #4
628 bl idct_col_armv5te
629 add a1, a1, #4
630 bl idct_col_armv5te
631
632 ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
633 endfunc
634
635 function ff_simple_idct_add_armv5te, export=1
636 stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
637
638 mov a1, a3
639
640 bl idct_row_armv5te
641 add a1, a1, #16
642 bl idct_row_armv5te
643 add a1, a1, #16
644 bl idct_row_armv5te
645 add a1, a1, #16
646 bl idct_row_armv5te
647 add a1, a1, #16
648 bl idct_row_armv5te
649 add a1, a1, #16
650 bl idct_row_armv5te
651 add a1, a1, #16
652 bl idct_row_armv5te
653 add a1, a1, #16
654 bl idct_row_armv5te
655
656 sub a1, a1, #(16*7)
657
658 bl idct_col_add_armv5te
659 add a1, a1, #4
660 bl idct_col_add_armv5te
661 add a1, a1, #4
662 bl idct_col_add_armv5te
663 add a1, a1, #4
664 bl idct_col_add_armv5te
665
666 add sp, sp, #8
667 ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
668 endfunc
669
670 function ff_simple_idct_put_armv5te, export=1
671 stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
672
673 mov a1, a3
674
675 bl idct_row_armv5te
676 add a1, a1, #16
677 bl idct_row_armv5te
678 add a1, a1, #16
679 bl idct_row_armv5te
680 add a1, a1, #16
681 bl idct_row_armv5te
682 add a1, a1, #16
683 bl idct_row_armv5te
684 add a1, a1, #16
685 bl idct_row_armv5te
686 add a1, a1, #16
687 bl idct_row_armv5te
688 add a1, a1, #16
689 bl idct_row_armv5te
690
691 sub a1, a1, #(16*7)
692
693 bl idct_col_put_armv5te
694 add a1, a1, #4
695 bl idct_col_put_armv5te
696 add a1, a1, #4
697 bl idct_col_put_armv5te
698 add a1, a1, #4
699 bl idct_col_put_armv5te
700
701 add sp, sp, #8
702 ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
703 endfunc