bc7efc331c22bf538b81079ba8a16292c505dd34
[libav.git] / libavcodec / arm / dsputil_armv6.S
1 /*
2 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "asm.S"
22
23 preserve8
24
25 .text
26
/*
 * call_2x_pixels type, subp
 *
 * Emits ff_<type>_pixels16<subp>_armv6, a 16-byte-wide wrapper built from
 * two invocations of ff_<type>_pixels8<subp>_armv6: the first on the block
 * addressed by the incoming r0/r1, the second on the block 8 bytes further
 * along (r0+8 / r1+8). r2 and r3 are passed through unchanged to both.
 * subp may be left empty.
 */
27 .macro call_2x_pixels type, subp
28 function ff_\type\()_pixels16\subp\()_armv6, export=1
/* Keep the incoming r0-r3 and the return address across the first call. */
29 push {r0-r3, lr}
30 bl ff_\type\()_pixels8\subp\()_armv6
31 pop {r0-r3, lr}
/* Step both pointers 8 bytes to the right for the second half. */
32 add r0, r0, #8
33 add r1, r1, #8
/*
 * Tail branch: lr was restored above, so the 8-wide routine returns
 * directly to the caller of this wrapper.
 */
34 b ff_\type\()_pixels8\subp\()_armv6
35 .endfunc
36 .endm
37
/*
 * Instantiate the 16-wide wrappers. These expand to:
 *   ff_avg_pixels16_armv6            (uses ff_avg_pixels8_armv6)
 *   ff_put_pixels16_x2_armv6         (uses ff_put_pixels8_x2_armv6)
 *   ff_put_pixels16_y2_armv6         (uses ff_put_pixels8_y2_armv6)
 *   ff_put_pixels16_x2_no_rnd_armv6  (uses ff_put_pixels8_x2_no_rnd_armv6)
 *   ff_put_pixels16_y2_no_rnd_armv6  (uses ff_put_pixels8_y2_no_rnd_armv6)
 */
38 call_2x_pixels avg
39 call_2x_pixels put, _x2
40 call_2x_pixels put, _y2
41 call_2x_pixels put, _x2_no_rnd
42 call_2x_pixels put, _y2_no_rnd
43
/*
 * ff_put_pixels16_armv6
 *
 * Copies a block 16 bytes wide from [r1] to [r0], two rows per iteration.
 *   r0  destination pointer (written with strd)
 *   r1  source pointer (read with ldr)
 *   r2  byte offset added to both pointers after each row
 *   r3  row counter, decremented by 2 per iteration
 * The loop exits only when r3 reaches exactly zero, so r3 is expected to be
 * a non-zero multiple of 2.
 * NOTE(review): the C-level prototype is not visible here; the roles above
 * are read off the register usage.
 */
44 function ff_put_pixels16_armv6, export=1
/* r4-r11 are used as data registers below; save them first. */
45 push {r4-r11}
46 1:
/* First row: bytes 4-15 are loaded first, then bytes 0-3 with r1 += r2. */
47 ldr r5, [r1, #4]
48 ldr r6, [r1, #8]
49 ldr r7, [r1, #12]
50 ldr r4, [r1], r2
/* Store the upper 8 bytes of the first row. */
51 strd r6, r7, [r0, #8]
/* Second-row loads are interleaved with the first-row stores. */
52 ldr r9, [r1, #4]
/* Store the lower 8 bytes of the first row and advance r0 by r2. */
53 strd r4, r5, [r0], r2
54 ldr r10, [r1, #8]
55 ldr r11, [r1, #12]
56 ldr r8, [r1], r2
57 strd r10, r11, [r0, #8]
/* Two rows done; the flags set here drive the bne below. */
58 subs r3, r3, #2
59 strd r8, r9, [r0], r2
60 bne 1b
61
62 pop {r4-r11}
63 bx lr
64 .endfunc
65
/*
 * ff_put_pixels8_armv6
 *
 * Copies a block 8 bytes wide from [r1] to [r0], two rows per iteration.
 *   r0  destination pointer (written with strd)
 *   r1  source pointer (read with ldr)
 *   r2  byte offset added to both pointers after each row
 *   r3  row counter, decremented by 2 per iteration
 * The loop exits only when r3 reaches exactly zero, so r3 is expected to be
 * a non-zero multiple of 2.
 */
66 function ff_put_pixels8_armv6, export=1
67 push {r4-r7}
68 1:
/* First row: bytes 4-7, then bytes 0-3 with r1 += r2. */
69 ldr r5, [r1, #4]
70 ldr r4, [r1], r2
/* Second row is loaded while the first row is being stored. */
71 ldr r7, [r1, #4]
72 strd r4, r5, [r0], r2
73 ldr r6, [r1], r2
/* The flags set here drive the bne below. */
74 subs r3, r3, #2
75 strd r6, r7, [r0], r2
76 bne 1b
77
78 pop {r4-r7}
79 bx lr
80 .endfunc
81
/*
 * ff_put_pixels8_x2_armv6
 *
 * For each of 8 output bytes per row, stores the rounded average of source
 * byte i and source byte i+1, i.e. (a + b + 1) >> 1. Nine source bytes are
 * read per row. Two rows are processed per iteration.
 *   r0  destination pointer (written with strd)
 *   r1  source pointer
 *   r2  byte offset between rows, applied to both pointers
 *   r3  row counter, decremented by 2; the loop exits when it reaches zero
 *
 * Rounding: uhadd8 gives floor((a + b) / 2) per byte; adding (a ^ b) & 1
 * to each byte rounds the half case upward.
 *
 * Register use inside the loop:
 *   first row:  r4 = bytes 0-3, r5 = bytes 4-7, r6 = bytes 1-4, r7 = bytes 5-8
 *   second row: r8 = bytes 0-3, r9 = bytes 4-7, r10 = bytes 1-4, r11 = bytes 5-8
 *   r14 and r6 are reused as scratch for the rounding terms.
 */
82 function ff_put_pixels8_x2_armv6, export=1
83 push {r4-r11, lr}
/* r12 = 0x01010101, a mask selecting bit 0 of every byte. */
84 mov r12, #1
85 orr r12, r12, r12, lsl #8
86 orr r12, r12, r12, lsl #16
87 1:
88 ldr r4, [r1]
89 subs r3, r3, #2
90 ldr r5, [r1, #4]
91 ldr r7, [r1, #5]
/* r6 = (r4 >> 8) | (r5 << 24): with little-endian loads, source bytes 1-4. */
92 lsr r6, r4, #8
/* Pre-indexed load with writeback: r1 moves to the second row. */
93 ldr r8, [r1, r2]!
94 orr r6, r6, r5, lsl #24
95 ldr r9, [r1, #4]
96 ldr r11, [r1, #5]
97 lsr r10, r8, #8
/* r1 now points at the first row of the next iteration. */
98 add r1, r1, r2
99 orr r10, r10, r9, lsl #24
/* First row: truncating average plus rounding term for both words. */
100 eor r14, r4, r6
101 uhadd8 r4, r4, r6
102 eor r6, r5, r7
103 uhadd8 r5, r5, r7
104 and r14, r14, r12
105 and r6, r6, r12
106 uadd8 r4, r4, r14
/* Second row: same computation, interleaved with the end of the first. */
107 eor r14, r8, r10
108 uadd8 r5, r5, r6
109 eor r6, r9, r11
110 uhadd8 r8, r8, r10
111 and r14, r14, r12
112 uhadd8 r9, r9, r11
113 and r6, r6, r12
114 uadd8 r8, r8, r14
115 strd r4, r5, [r0], r2
116 uadd8 r9, r9, r6
117 strd r8, r9, [r0], r2
/* Flags are still those of the subs at the top of the loop. */
118 bne 1b
119
120 pop {r4-r11, pc}
121 .endfunc
122
/*
 * ff_put_pixels8_y2_armv6
 *
 * For each output row, stores the per-byte rounded average, (a + b + 1) >> 1,
 * of a source row and the source row below it. The block is 8 bytes wide and
 * two output rows are produced per iteration.
 *   r0  destination pointer (written with strd)
 *   r1  source pointer
 *   r2  byte offset between rows, applied to both pointers
 *   r3  row counter, decremented by 2; the loop exits when it reaches zero
 *
 * Rounding: uhadd8 gives floor((a + b) / 2) per byte; adding (a ^ b) & 1
 * rounds the half case upward.
 *
 * NOTE(review): the load of the next row pair at the bottom of the loop is
 * unconditional, so on the final iteration one source row beyond those
 * needed for the output is read and discarded. Confirm callers guarantee
 * that row is readable.
 */
123 function ff_put_pixels8_y2_armv6, export=1
124 push {r4-r11}
/* r12 = 0x01010101, a mask selecting bit 0 of every byte. */
125 mov r12, #1
126 orr r12, r12, r12, lsl #8
127 orr r12, r12, r12, lsl #16
/* Prime the pipeline: r4/r5 = first source row, r6/r7 = second source row. */
128 ldr r4, [r1]
129 ldr r5, [r1, #4]
130 ldr r6, [r1, r2]!
131 ldr r7, [r1, #4]
132 1:
133 subs r3, r3, #2
/* First output row: average of r4/r5 with r6/r7, result in r8/r9. */
134 uhadd8 r8, r4, r6
135 eor r10, r4, r6
136 uhadd8 r9, r5, r7
137 eor r11, r5, r7
138 and r10, r10, r12
/* r4/r5 are free again; reload them with the next source row. */
139 ldr r4, [r1, r2]!
140 uadd8 r8, r8, r10
141 and r11, r11, r12
142 uadd8 r9, r9, r11
143 ldr r5, [r1, #4]
/* Second output row: average of the new r4/r5 with r6/r7, result in r10/r11. */
144 uhadd8 r10, r4, r6
145 eor r6, r4, r6
146 uhadd8 r11, r5, r7
147 and r6, r6, r12
148 eor r7, r5, r7
149 uadd8 r10, r10, r6
150 and r7, r7, r12
/* Load the following source row into r6/r7 for the next iteration. */
151 ldr r6, [r1, r2]!
152 uadd8 r11, r11, r7
153 strd r8, r9, [r0], r2
154 ldr r7, [r1, #4]
155 strd r10, r11, [r0], r2
/* Flags are still those of the subs at the top of the loop. */
156 bne 1b
157
158 pop {r4-r11}
159 bx lr
160 .endfunc
161
/*
 * ff_put_pixels8_x2_no_rnd_armv6
 *
 * For each of 8 output bytes per row, stores the truncating average
 * (a + b) >> 1 of source byte i and source byte i+1 (uhadd8 only, no
 * rounding term). Nine source bytes are read per row; two rows are
 * processed per iteration.
 *   r0  destination pointer (written with stm)
 *   r1  source pointer
 *   r2  byte offset between rows, applied to both pointers
 *   r3  row counter, decremented by 2; the loop exits when it reaches zero
 *
 * Register use inside the loop:
 *   first row:  r4 = bytes 0-3, r5 = bytes 4-7, r6 = bytes 1-4, r7 = bytes 5-8
 *   second row: r8 = bytes 0-3, r9 = bytes 4-7, r12 = bytes 1-4, r14 = bytes 5-8
 */
162 function ff_put_pixels8_x2_no_rnd_armv6, export=1
163 push {r4-r9, lr}
164 1:
165 subs r3, r3, #2
166 ldr r4, [r1]
167 ldr r5, [r1, #4]
168 ldr r7, [r1, #5]
/* Pre-indexed load with writeback: r1 moves to the second row. */
169 ldr r8, [r1, r2]!
170 ldr r9, [r1, #4]
171 ldr r14, [r1, #5]
/* r1 now points at the first row of the next iteration. */
172 add r1, r1, r2
/* Build the words holding bytes 1-4 of each row from the two aligned-offset loads. */
173 lsr r6, r4, #8
174 orr r6, r6, r5, lsl #24
175 lsr r12, r8, #8
176 orr r12, r12, r9, lsl #24
177 uhadd8 r4, r4, r6
178 uhadd8 r5, r5, r7
179 uhadd8 r8, r8, r12
180 uhadd8 r9, r9, r14
/* Store one row, step r0 by r2, store the other row, step again. */
181 stm r0, {r4,r5}
182 add r0, r0, r2
183 stm r0, {r8,r9}
184 add r0, r0, r2
/* Flags are still those of the subs at the top of the loop. */
185 bne 1b
186
187 pop {r4-r9, pc}
188 .endfunc
189
/*
 * ff_put_pixels8_y2_no_rnd_armv6
 *
 * For each output row, stores the per-byte truncating average (a + b) >> 1
 * of a source row and the source row below it (uhadd8 only, no rounding
 * term). The block is 8 bytes wide; two output rows per iteration.
 *   r0  destination pointer (written with stm)
 *   r1  source pointer
 *   r2  byte offset between rows, applied to both pointers
 *   r3  row counter, decremented by 2; the loop exits when it reaches zero
 *
 * NOTE(review): the load into r6/r7 at the bottom of the loop is
 * unconditional, so on the final iteration one source row beyond those
 * needed for the output is read and discarded. Confirm callers guarantee
 * that row is readable.
 */
190 function ff_put_pixels8_y2_no_rnd_armv6, export=1
191 push {r4-r9, lr}
/* Prime the pipeline: r4/r5 = first source row, r6/r7 = second source row. */
192 ldr r4, [r1]
193 ldr r5, [r1, #4]
194 ldr r6, [r1, r2]!
195 ldr r7, [r1, #4]
196 1:
197 subs r3, r3, #2
/* First output row (r8/r9) = average of r4/r5 and r6/r7. */
198 uhadd8 r8, r4, r6
/* Reload r4/r5 with the next source row as soon as each word is consumed. */
199 ldr r4, [r1, r2]!
200 uhadd8 r9, r5, r7
201 ldr r5, [r1, #4]
/* Second output row (r12/r14) = average of the new r4/r5 and r6/r7. */
202 uhadd8 r12, r4, r6
/* Load the following source row into r6/r7 for the next iteration. */
203 ldr r6, [r1, r2]!
204 uhadd8 r14, r5, r7
205 ldr r7, [r1, #4]
206 stm r0, {r8,r9}
207 add r0, r0, r2
208 stm r0, {r12,r14}
209 add r0, r0, r2
/* Flags are still those of the subs at the top of the loop. */
210 bne 1b
211
212 pop {r4-r9, pc}
213 .endfunc
214
/*
 * ff_avg_pixels8_armv6
 *
 * Replaces each 8-byte destination row with the per-byte rounded average,
 * (a + b + 1) >> 1, of the existing destination row and the source row.
 *   r0  destination pointer (read with ldrd, written with strd)
 *   r1  source pointer (read with ldr)
 *   r2  byte offset between rows, applied to both pointers
 *   r3  row counter, decremented by 2 per pass
 *
 * The loop is software pipelined: data for the next row is loaded while the
 * current row is being computed. Every pass through label 1 handles two
 * rows; the second row is finished either in the loop tail (more rows
 * follow) or at label 2 (last pair). The beq tests the flags left by the
 * most recent subs, since none of the instructions in between modify the
 * condition flags.
 *
 * Rounding: uhadd8 gives floor((a + b) / 2) per byte; adding (a ^ b) & 1
 * rounds the half case upward.
 */
215 function ff_avg_pixels8_armv6, export=1
/* Prefetch hint for the second source row. */
216 pld [r1, r2]
217 push {r4-r10, lr}
/* lr = 0x01010101, a mask selecting bit 0 of every byte. */
218 mov lr, #1
219 orr lr, lr, lr, lsl #8
220 orr lr, lr, lr, lsl #16
/* Prime the pipeline: r4/r5 = first destination row, r9/r10 = first source row. */
221 ldrd r4, r5, [r0]
222 ldr r10, [r1, #4]
223 ldr r9, [r1], r2
224 subs r3, r3, #2
225 1:
226 pld [r1, r2]
/* First row of the pair: average r4/r5 (dst) with r9/r10 (src). */
227 eor r8, r4, r9
228 uhadd8 r4, r4, r9
229 eor r12, r5, r10
/* r0 has not been advanced yet, so this reads the next destination row. */
230 ldrd r6, r7, [r0, r2]
231 uhadd8 r5, r5, r10
232 and r8, r8, lr
/* r9/r10 are consumed; reload them with the next source row. */
233 ldr r10, [r1, #4]
234 and r12, r12, lr
235 uadd8 r4, r4, r8
236 ldr r9, [r1], r2
/* Second row of the pair: start averaging r6/r7 (dst) with r9/r10 (src). */
237 eor r8, r6, r9
238 uadd8 r5, r5, r12
239 pld [r1, r2, lsl #1]
240 eor r12, r7, r10
241 uhadd8 r6, r6, r9
/* Write back the first row and advance r0. */
242 strd r4, r5, [r0], r2
243 uhadd8 r7, r7, r10
/* Last pair: finish the second row at label 2 without loading further rows. */
244 beq 2f
245 and r8, r8, lr
/* r0 already points at the second row, so r0 + r2 is the row after it. */
246 ldrd r4, r5, [r0, r2]
247 uadd8 r6, r6, r8
248 ldr r10, [r1, #4]
249 and r12, r12, lr
250 subs r3, r3, #2
251 uadd8 r7, r7, r12
252 ldr r9, [r1], r2
253 strd r6, r7, [r0], r2
254 b 1b
255 2:
/* Apply the rounding term to the final row and store it. */
256 and r8, r8, lr
257 and r12, r12, lr
258 uadd8 r6, r6, r8
259 uadd8 r7, r7, r12
260 strd r6, r7, [r0], r2
261
262 pop {r4-r10, pc}
263 .endfunc
264
/*
 * ff_add_pixels_clamped_armv6
 *
 * For 8 rows of 8 pixels: adds a 16-bit value to each 8-bit pixel,
 * saturates the sum to the range 0..255 and writes the pixel back.
 *   r0  pointer to the 16-bit values; 16 bytes (8 halfwords) are consumed
 *       per row and the pointer is advanced by ldm writeback
 *   r1  pixel pointer (read with ldrd, written with strd)
 *   r2  byte offset between pixel rows
 * The row count is fixed at 8 (r3 is overwritten on entry).
 */
265 function ff_add_pixels_clamped_armv6, export=1
266 push {r4-r8,lr}
267 mov r3, #8
268 1:
/* r4, r5, r12, lr = halfword pairs (c1:c0), (c3:c2), (c5:c4), (c7:c6). */
269 ldm r0!, {r4,r5,r12,lr}
/* r6 = pixels 0-3, r7 = pixels 4-7. */
270 ldrd r6, r7, [r1]
/*
 * Regroup the halfwords into even/odd pairs so they line up with what
 * uxtab16 extracts (bytes 0 and 2, or bytes 1 and 3 after ror #8):
 *   r8 = (c2:c0), r5 = (c3:c1), r4 = (c6:c4), lr = (c7:c5)
 */
271 pkhbt r8, r4, r5, lsl #16
272 pkhtb r5, r5, r4, asr #16
273 pkhbt r4, r12, lr, lsl #16
274 pkhtb lr, lr, r12, asr #16
275 pld [r1, r2]
/* Add the zero-extended pixels to the matching halfwords. */
276 uxtab16 r8, r8, r6
277 uxtab16 r5, r5, r6, ror #8
278 uxtab16 r4, r4, r7
279 uxtab16 lr, lr, r7, ror #8
/* Saturate each signed halfword to an unsigned 8-bit value. */
280 usat16 r8, #8, r8
281 usat16 r5, #8, r5
282 usat16 r4, #8, r4
283 usat16 lr, #8, lr
/* Interleave even and odd results back into byte order. */
284 orr r6, r8, r5, lsl #8
285 orr r7, r4, lr, lsl #8
286 subs r3, r3, #1
287 strd r6, r7, [r1], r2
288 bgt 1b
289 pop {r4-r8,pc}
290 .endfunc
291
/*
 * ff_get_pixels_armv6
 *
 * Reads 8 rows of 8 pixels and writes each pixel zero-extended to 16 bits,
 * in pixel order, to a contiguous output buffer.
 *   r0  output pointer; 16 bytes (8 halfwords) are written per row and the
 *       pointer is advanced by stm writeback
 *   r1  pixel pointer (read with ldrd, post-incremented by r2)
 *   r2  byte offset between pixel rows
 * The row count is fixed at 8 and kept in lr.
 */
292 function ff_get_pixels_armv6, export=1
293 pld [r1, r2]
294 push {r4-r8, lr}
295 mov lr, #8
296 1:
/* r4 = pixels 0-3, r5 = pixels 4-7. */
297 ldrd r4, r5, [r1], r2
/* The flags set here are tested by the bgt at the bottom of the loop. */
298 subs lr, lr, #1
/* Split each word into even pixels (bytes 0, 2) and odd pixels (bytes 1, 3). */
299 uxtb16 r6, r4
300 uxtb16 r4, r4, ror #8
301 uxtb16 r12, r5
302 uxtb16 r8, r5, ror #8
303 pld [r1, r2]
/* Re-pair the halfwords so the output is in pixel order: (p1:p0), (p3:p2), ... */
304 pkhbt r5, r6, r4, lsl #16
305 pkhtb r6, r4, r6, asr #16
306 pkhbt r7, r12, r8, lsl #16
307 pkhtb r12, r8, r12, asr #16
308 stm r0!, {r5,r6,r7,r12}
309 bgt 1b
310
311 pop {r4-r8, pc}
312 .endfunc
313
/*
 * ff_diff_pixels_armv6
 *
 * For 8 rows of 8 pixels, writes the 16-bit difference
 * (pixel at r1) - (pixel at r2), in pixel order, to a contiguous buffer.
 *   r0  output pointer; 16 bytes (8 halfwords) are written per row and the
 *       pointer is advanced by stm writeback
 *   r1  first pixel pointer (minuend), read with ldrd
 *   r2  second pixel pointer (subtrahend), read with ldrd
 *   r3  byte offset between rows, applied to both pixel pointers
 * The row count is fixed at 8 and kept in lr.
 */
314 function ff_diff_pixels_armv6, export=1
315 pld [r1, r3]
316 pld [r2, r3]
317 push {r4-r9, lr}
318 mov lr, #8
319 1:
/* r4/r5 = 8 pixels from r1, r6/r7 = 8 pixels from r2. */
320 ldrd r4, r5, [r1], r3
321 ldrd r6, r7, [r2], r3
/* Pixels 0-3: widen the even and odd bytes separately, then subtract. */
322 uxtb16 r8, r4
323 uxtb16 r4, r4, ror #8
324 uxtb16 r9, r6
325 uxtb16 r6, r6, ror #8
326 pld [r1, r3]
327 ssub16 r9, r8, r9
328 ssub16 r6, r4, r6
/* Start widening pixels 4-7 from r1. */
329 uxtb16 r8, r5
330 uxtb16 r5, r5, ror #8
331 pld [r2, r3]
/* Re-pair the differences for pixels 0-3 into pixel order: r4 = (d1:d0), r6 = (d3:d2). */
332 pkhbt r4, r9, r6, lsl #16
333 pkhtb r6, r6, r9, asr #16
/* Pixels 4-7 from r2, then subtract. */
334 uxtb16 r9, r7
335 uxtb16 r7, r7, ror #8
336 ssub16 r9, r8, r9
337 ssub16 r5, r5, r7
338 subs lr, lr, #1
/* r8 = (d5:d4), r9 = (d7:d6). */
339 pkhbt r8, r9, r5, lsl #16
340 pkhtb r9, r5, r9, asr #16
341 stm r0!, {r4,r6,r8,r9}
342 bgt 1b
343
344 pop {r4-r9, pc}
345 .endfunc
346
/*
 * ff_pix_abs16_armv6
 *
 * Returns in r0 the sum of absolute differences between two blocks that
 * are 16 bytes wide.
 *   r0    not read on entry; it is immediately overwritten
 *   r1    first block pointer (read with ldm)
 *   r2    second block pointer (read with ldr)
 *   r3    byte offset between rows, applied to both pointers
 *   [sp]  row count: the word at the top of the stack on entry is loaded
 *         into r0 and used as the loop counter
 * The loop exits when the counter reaches exactly zero after a decrement
 * of 1, so the count must be at least 1.
 *
 * Two accumulators (r12 and lr) are updated alternately and summed at the
 * end. The first-block row and the first word of the second-block row are
 * loaded ahead of the loop body that consumes them.
 */
347 function ff_pix_abs16_armv6, export=1
/* Fetch the stack argument before the push moves sp. */
348 ldr r0, [sp]
349 push {r4-r9, lr}
350 mov r12, #0
351 mov lr, #0
/* Prime the pipeline with the first row of r1 and the first word of r2. */
352 ldm r1, {r4-r7}
353 ldr r8, [r2]
354 1:
355 ldr r9, [r2, #4]
356 pld [r1, r3]
/* usada8 adds the four per-byte absolute differences to the accumulator. */
357 usada8 r12, r4, r8, r12
358 ldr r8, [r2, #8]
359 pld [r2, r3]
360 usada8 lr, r5, r9, lr
361 ldr r9, [r2, #12]
362 usada8 r12, r6, r8, r12
363 subs r0, r0, #1
364 usada8 lr, r7, r9, lr
/* No further rows: skip the loads for the next row. */
365 beq 2f
366 add r1, r1, r3
367 ldm r1, {r4-r7}
368 add r2, r2, r3
369 ldr r8, [r2]
370 b 1b
371 2:
/* Combine the two partial sums into the return value. */
372 add r0, r12, lr
373 pop {r4-r9, pc}
374 .endfunc
375
/*
 * ff_pix_abs16_x2_armv6
 *
 * Returns in r0 the sum of absolute differences between a 16-byte-wide
 * block at r1 and the horizontally interpolated block at r2, where each
 * interpolated byte is the rounded average (a + b + 1) >> 1 of r2 byte i
 * and r2 byte i+1. Seventeen bytes are read from r2 per row.
 *   r0    not read on entry; used as the accumulator
 *   r1    first block pointer
 *   r2    second block pointer (the one that is interpolated)
 *   r3    byte offset between rows, applied to both pointers
 *   [sp]  row count: the word at the top of the stack on entry is loaded
 *         into r12 and decremented by 1 per row
 *
 * Rounding: uhadd8 gives floor((a + b) / 2) per byte; adding (a ^ b) & 1
 * rounds the half case upward. Each word of bytes i+1 is built as
 * (word >> 8) | (next word << 24).
 */
376 function ff_pix_abs16_x2_armv6, export=1
/* Fetch the stack argument before the push moves sp. */
377 ldr r12, [sp]
378 push {r4-r11, lr}
379 mov r0, #0
/* lr = 0x01010101, a mask selecting bit 0 of every byte. */
380 mov lr, #1
381 orr lr, lr, lr, lsl #8
382 orr lr, lr, lr, lsl #16
383 1:
/* Bytes 0-3: r8 = r2 bytes 0-3, r10 = r2 bytes 1-4, r4 = r1 bytes 0-3. */
384 ldr r8, [r2]
385 ldr r9, [r2, #4]
386 lsr r10, r8, #8
387 ldr r4, [r1]
388 lsr r6, r9, #8
389 orr r10, r10, r9, lsl #24
390 ldr r5, [r2, #8]
391 eor r11, r8, r10
392 uhadd8 r7, r8, r10
/* r6 = r2 bytes 5-8, prepared for the second word. */
393 orr r6, r6, r5, lsl #24
394 and r11, r11, lr
395 uadd8 r7, r7, r11
396 ldr r8, [r1, #4]
397 usada8 r0, r4, r7, r0
/* Bytes 4-7: average r9 with r6 into r4, compare against r8 (r1 bytes 4-7). */
398 eor r7, r9, r6
399 lsr r10, r5, #8
400 and r7, r7, lr
401 uhadd8 r4, r9, r6
402 ldr r6, [r2, #12]
403 uadd8 r4, r4, r7
404 pld [r1, r3]
/* r10 = r2 bytes 9-12. */
405 orr r10, r10, r6, lsl #24
406 usada8 r0, r8, r4, r0
/* Bytes 8-11: average r5 with r10 into r8, compare against r4 (r1 bytes 8-11). */
407 ldr r4, [r1, #8]
408 eor r11, r5, r10
/* The seventeenth byte of the r2 row, needed for the last average. */
409 ldrb r7, [r2, #16]
410 and r11, r11, lr
411 uhadd8 r8, r5, r10
412 ldr r5, [r1, #12]
413 uadd8 r8, r8, r11
414 pld [r2, r3]
415 lsr r10, r6, #8
416 usada8 r0, r4, r8, r0
/* Bytes 12-15: r10 = r2 bytes 13-16; average with r6 into r9, compare against r5. */
417 orr r10, r10, r7, lsl #24
418 subs r12, r12, #1
419 eor r11, r6, r10
420 add r1, r1, r3
421 uhadd8 r9, r6, r10
422 and r11, r11, lr
423 uadd8 r9, r9, r11
424 add r2, r2, r3
425 usada8 r0, r5, r9, r0
/* Flags are still those of the subs above. */
426 bgt 1b
427
428 pop {r4-r11, pc}
429 .endfunc
430
/*
 * usad_y2 p0, p1, p2, p3, n0, n1, n2, n3
 *
 * Processes one 16-byte row for the vertically interpolated SAD.
 *   On entry:  p0-p3 hold the previous row of the block at r2 (16 bytes),
 *              r2 points at the current row of that block,
 *              r1 points at the current row of the other block,
 *              lr holds 0x01010101, r0 is the running sum, r3 the row offset.
 *   On exit:   n0-n3 hold the row just read from r2 (ready to be passed as
 *              p0-p3 of the next expansion), p0-p3 are clobbered,
 *              r0 has the row's absolute differences added,
 *              r1 and r2 have been advanced by r3.
 *
 * Each byte of the previous and current r2 rows is combined into the
 * rounded average (a + b + 1) >> 1 (uhadd8 plus the (a ^ b) & 1 term) and
 * compared against the corresponding r1 byte with usada8. The n registers
 * and already-consumed p registers double as scratch along the way.
 */
431 .macro usad_y2 p0, p1, p2, p3, n0, n1, n2, n3
/* Word 0: average goes to p0; the first n1 is scratch, n2 temporarily holds r1 data. */
432 ldr \n0, [r2]
433 eor \n1, \p0, \n0
434 uhadd8 \p0, \p0, \n0
435 and \n1, \n1, lr
436 ldr \n2, [r1]
437 uadd8 \p0, \p0, \n1
/* n1 now receives its final value, the second word of the current r2 row. */
438 ldr \n1, [r2, #4]
439 usada8 r0, \p0, \n2, r0
440 pld [r1, r3]
/* Word 1: average goes to p1; n3 is scratch, p0 is reused for r1 data. */
441 eor \n3, \p1, \n1
442 uhadd8 \p1, \p1, \n1
443 and \n3, \n3, lr
444 ldr \p0, [r1, #4]
445 uadd8 \p1, \p1, \n3
/* n2 receives its final value, the third word of the current r2 row. */
446 ldr \n2, [r2, #8]
447 usada8 r0, \p1, \p0, r0
448 pld [r2, r3]
/* Word 2: average goes to p2; p0 is scratch, p1 is reused for r1 data. */
449 eor \p0, \p2, \n2
450 uhadd8 \p2, \p2, \n2
451 and \p0, \p0, lr
452 ldr \p1, [r1, #8]
453 uadd8 \p2, \p2, \p0
/* n3 receives its final value, the fourth word of the current r2 row. */
454 ldr \n3, [r2, #12]
455 usada8 r0, \p2, \p1, r0
/* Word 3: average goes to p3; p1 is scratch, p0 is reused for r1 data. */
456 eor \p1, \p3, \n3
457 uhadd8 \p3, \p3, \n3
458 and \p1, \p1, lr
459 ldr \p0, [r1, #12]
460 uadd8 \p3, \p3, \p1
461 add r1, r1, r3
462 usada8 r0, \p3, \p0, r0
463 add r2, r2, r3
464 .endm
465
/*
 * ff_pix_abs16_y2_armv6
 *
 * Returns in r0 the sum of absolute differences between a 16-byte-wide
 * block at r1 and the vertically interpolated block at r2, where each
 * interpolated byte is the rounded average of an r2 byte and the byte one
 * row below it. The per-row work is done by the usad_y2 macro.
 *   r0    not read on entry; used as the accumulator
 *   r1    first block pointer
 *   r2    second block pointer (the one that is interpolated)
 *   r3    byte offset between rows, applied to both pointers
 *   [sp]  row count: the word at the top of the stack on entry is loaded
 *         into r12 and decremented by 2 per iteration; the loop continues
 *         while the result is greater than zero
 */
466 function ff_pix_abs16_y2_armv6, export=1
467 pld [r1]
468 pld [r2]
/* Fetch the stack argument before the push moves sp. */
469 ldr r12, [sp]
470 push {r4-r11, lr}
471 mov r0, #0
/* lr = 0x01010101, the bit-0-per-byte mask that usad_y2 expects. */
472 mov lr, #1
473 orr lr, lr, lr, lsl #8
474 orr lr, lr, lr, lsl #16
/* Load the first r2 row into r4-r7 and step r2 to the second row. */
475 ldr r4, [r2]
476 ldr r5, [r2, #4]
477 ldr r6, [r2, #8]
478 ldr r7, [r2, #12]
479 add r2, r2, r3
480 1:
/*
 * Two rows per iteration. The register sets swap roles between the two
 * expansions so the row read by one becomes the previous row of the next.
 */
481 usad_y2 r4, r5, r6, r7, r8, r9, r10, r11
482 subs r12, r12, #2
483 usad_y2 r8, r9, r10, r11, r4, r5, r6, r7
/* usad_y2 contains no flag-setting instruction, so this tests the subs above. */
484 bgt 1b
485
486 pop {r4-r11, pc}
487 .endfunc
488
/*
 * ff_pix_abs8_armv6
 *
 * Returns in r0 the sum of absolute differences between two blocks that
 * are 8 bytes wide, two rows per pass.
 *   r0    not read on entry; used as one of two accumulators
 *   r1    first block pointer (read with ldrd, post-incremented by r3)
 *   r2    second block pointer (read with ldr, post-incremented by r3)
 *   r3    byte offset between rows
 *   [sp]  row count: the word at the top of the stack on entry is loaded
 *         into r12 and decremented by 2 per pass; the loop exits when it
 *         reaches exactly zero
 *
 * The loop is software pipelined: the r1 row for the next step is loaded
 * before the current one is consumed. r4/r5 and r8/r9 alternate as the
 * holders of r1 rows; r6/r7 hold the r2 row. r0 and lr are summed at the end.
 */
489 function ff_pix_abs8_armv6, export=1
490 pld [r2, r3]
/* Fetch the stack argument before the push moves sp. */
491 ldr r12, [sp]
492 push {r4-r9, lr}
493 mov r0, #0
494 mov lr, #0
/* Prime the pipeline with the first r1 row. */
495 ldrd r4, r5, [r1], r3
496 1:
497 subs r12, r12, #2
/* r2 row matching r4/r5. */
498 ldr r7, [r2, #4]
499 ldr r6, [r2], r3
/* Next r1 row, consumed in the second half of this pass. */
500 ldrd r8, r9, [r1], r3
501 usada8 r0, r4, r6, r0
502 pld [r2, r3]
503 usada8 lr, r5, r7, lr
/* r2 row matching r8/r9. */
504 ldr r7, [r2, #4]
505 ldr r6, [r2], r3
/* Last pair: finish at label 2 without loading another r1 row. */
506 beq 2f
507 ldrd r4, r5, [r1], r3
508 usada8 r0, r8, r6, r0
509 pld [r2, r3]
510 usada8 lr, r9, r7, lr
511 b 1b
512 2:
513 usada8 r0, r8, r6, r0
514 usada8 lr, r9, r7, lr
/* Combine the two partial sums into the return value. */
515 add r0, r0, lr
516 pop {r4-r9, pc}
517 .endfunc
518
/*
 * ff_sse16_armv6
 *
 * Returns in r0 the sum of squared differences between two blocks that
 * are 16 bytes wide.
 *   r0    not read on entry; used as the accumulator
 *   r1    first block pointer (read with ldrd)
 *   r2    second block pointer (read with ldr)
 *   r3    byte offset between rows, applied to both pointers
 *   [sp]  row count: the word at the top of the stack on entry is loaded
 *         into r12 and decremented by 1 per row
 *
 * Per 4-byte word: uxtb16 widens the even bytes and (with ror #8) the odd
 * bytes to halfword pairs, usub16 forms the two 16-bit differences, and
 * smlad with both operands equal adds the two squares to r0.
 */
519 function ff_sse16_armv6, export=1
/* Fetch the stack argument before the push moves sp. */
520 ldr r12, [sp]
521 push {r4-r9, lr}
522 mov r0, #0
523 1:
/* Bytes 0-3: r4 from r1, r8 from r2. */
524 ldrd r4, r5, [r1]
525 ldr r8, [r2]
526 uxtb16 lr, r4
527 uxtb16 r4, r4, ror #8
528 uxtb16 r9, r8
529 uxtb16 r8, r8, ror #8
530 ldr r7, [r2, #4]
531 usub16 lr, lr, r9
532 usub16 r4, r4, r8
533 smlad r0, lr, lr, r0
/* Bytes 4-7: r5 from r1, r7 from r2. */
534 uxtb16 r6, r5
535 uxtb16 lr, r5, ror #8
536 uxtb16 r8, r7
537 uxtb16 r9, r7, ror #8
538 smlad r0, r4, r4, r0
/* Load bytes 8-15 of the r1 row while the previous word is finished. */
539 ldrd r4, r5, [r1, #8]
540 usub16 r6, r6, r8
541 usub16 r8, lr, r9
542 ldr r7, [r2, #8]
543 smlad r0, r6, r6, r0
/* Bytes 8-11: r4 from r1, r7 from r2. */
544 uxtb16 lr, r4
545 uxtb16 r4, r4, ror #8
546 uxtb16 r9, r7
547 uxtb16 r7, r7, ror #8
548 smlad r0, r8, r8, r0
549 ldr r8, [r2, #12]
550 usub16 lr, lr, r9
551 usub16 r4, r4, r7
552 smlad r0, lr, lr, r0
/* Bytes 12-15: r5 from r1, r8 from r2. */
553 uxtb16 r6, r5
554 uxtb16 r5, r5, ror #8
555 uxtb16 r9, r8
556 uxtb16 r8, r8, ror #8
557 smlad r0, r4, r4, r0
558 usub16 r6, r6, r9
559 usub16 r5, r5, r8
560 smlad r0, r6, r6, r0
/* Step both pointers to the next row. */
561 add r1, r1, r3
562 add r2, r2, r3
563 subs r12, r12, #1
564 smlad r0, r5, r5, r0
/* Tests the flags set by the subs above. */
565 bgt 1b
566
567 pop {r4-r9, pc}
568 .endfunc
569
/*
 * ff_pix_norm1_armv6
 *
 * Returns in r0 the sum of the squares of all bytes in a block that is
 * 16 bytes wide and 16 rows high.
 *   r0  block pointer (read with ldm, advanced by r1 per row)
 *   r1  byte offset between rows
 * The row count is fixed at 16 and kept in r12; lr is the accumulator.
 *
 * Per 4-byte word: uxtb16 widens the even bytes and (with ror #8) the odd
 * bytes to halfword pairs; smlad with both operands equal adds the two
 * squares to the accumulator.
 */
570 function ff_pix_norm1_armv6, export=1
571 push {r4-r6, lr}
572 mov r12, #16
573 mov lr, #0
574 1:
/* r2-r5 = the 16 bytes of the current row. */
575 ldm r0, {r2-r5}
576 uxtb16 r6, r2
577 uxtb16 r2, r2, ror #8
578 smlad lr, r6, r6, lr
579 uxtb16 r6, r3
580 smlad lr, r2, r2, lr
581 uxtb16 r3, r3, ror #8
582 smlad lr, r6, r6, lr
583 uxtb16 r6, r4
584 smlad lr, r3, r3, lr
585 uxtb16 r4, r4, ror #8
586 smlad lr, r6, r6, lr
587 uxtb16 r6, r5
588 smlad lr, r4, r4, lr
589 uxtb16 r5, r5, ror #8
590 smlad lr, r6, r6, lr
591 subs r12, r12, #1
/* Step to the next row. */
592 add r0, r0, r1
593 smlad lr, r5, r5, lr
/* Tests the flags set by the subs above. */
594 bgt 1b
595
/* Move the accumulated sum into the return register. */
596 mov r0, lr
597 pop {r4-r6, pc}
598 .endfunc
599
/*
 * ff_pix_sum_armv6
 *
 * Returns in r0 the sum of all bytes in a block that is 16 bytes wide and
 * 16 rows high.
 *   r0  block pointer (advanced by r1 per row via pre-indexed writeback)
 *   r1  byte offset between rows
 * The row count is fixed at 16 and kept in r12.
 *
 * usada8 against a zero register (lr) adds the four bytes of a word to an
 * accumulator. Two accumulators, r2 and r3, are used alternately and
 * summed at the end. The first word of each row is loaded one step ahead.
 */
600 function ff_pix_sum_armv6, export=1
601 push {r4-r7, lr}
602 mov r12, #16
603 mov r2, #0
604 mov r3, #0
/* lr stays zero and serves as the second usada8 operand. */
605 mov lr, #0
/* Prime the pipeline with the first word of the first row. */
606 ldr r4, [r0]
607 1:
608 subs r12, r12, #1
609 ldr r5, [r0, #4]
610 usada8 r2, r4, lr, r2
611 ldr r6, [r0, #8]
612 usada8 r3, r5, lr, r3
613 ldr r7, [r0, #12]
614 usada8 r2, r6, lr, r2
/* Last row: add its final word at label 2 without loading another row. */
615 beq 2f
/* Move to the next row and fetch its first word. */
616 ldr r4, [r0, r1]!
617 usada8 r3, r7, lr, r3
/* Flags are still those of the subs above; the counter is positive on this path. */
618 bgt 1b
619 2:
620 usada8 r3, r7, lr, r3
/* Combine the two partial sums into the return value. */
621 add r0, r2, r3
622 pop {r4-r7, pc}
623 .endfunc