acdfa12fdc9dfe2fc42b9180112e853d78f0f232
[libav.git] / libavcodec / arm / dsputil_armv6.S
1 /*
2 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "asm.S"
22
23 .text
24
/*
 * call_2x_pixels type, subp
 *
 * Emits ff_<type>_pixels16<subp>_armv6, which handles a 16-byte-wide
 * block by running ff_<type>_pixels8<subp>_armv6 twice: once at (r0, r1)
 * and once at (r0 + 8, r1 + 8).  r2 and r3 are handed through unchanged.
 *
 * The first call is an ordinary bl, so r0-r3 and lr are saved around it
 * (the 8-wide routines modify their argument registers).  The second call
 * is a tail call: the 8-wide routine returns directly to our caller.
 *
 * r4 is pushed purely as padding.  Six words keep sp 8-byte aligned at
 * the bl, which the AAPCS requires when calling an exported function
 * (assuming sp was 8-byte aligned on entry); five words would leave sp
 * at 4 mod 8.  Popping r4 again restores the value it had on entry, so
 * the callee-saved register is still preserved.
 */
25 .macro call_2x_pixels type, subp
26 function ff_\type\()_pixels16\subp\()_armv6, export=1
27 push {r0-r4, lr}
28 bl ff_\type\()_pixels8\subp\()_armv6
29 pop {r0-r4, lr}
/* Step both pointers to the right-hand 8-byte half of the block. */
30 add r0, r0, #8
31 add r1, r1, #8
32 b ff_\type\()_pixels8\subp\()_armv6
33 .endfunc
34 .endm
35
/*
 * Instantiate the 16-wide wrappers.  Each line expands to
 * ff_<type>_pixels16<subp>_armv6, built on the 8-wide routine of the
 * same name.  Plain "put" is not generated here: ff_put_pixels16_armv6
 * has its own dedicated implementation in this file.
 */
36 call_2x_pixels avg
37 call_2x_pixels put, _x2
38 call_2x_pixels put, _y2
39 call_2x_pixels put, _x2_no_rnd
40 call_2x_pixels put, _y2_no_rnd
41
/*
 * ff_put_pixels16_armv6
 *
 * Copies a block 16 bytes wide from r1 to r0, two rows per loop pass.
 *
 *   r0  destination pointer, advanced by r2 after every row
 *   r1  source pointer, advanced by r2 after every row
 *   r2  distance in bytes between rows, shared by source and destination
 *   r3  number of rows; the loop subtracts 2 per pass and only exits on
 *       exactly zero, so it must be a non-zero multiple of 2
 *
 * The source is read with single-word ldr and the destination is written
 * with strd.  r4-r11 are saved and restored; no value is returned.
 */
42 function ff_put_pixels16_armv6, export=1
43 push {r4-r11}
44 1:
/* First row: the word at offset 0 is loaded last because its post-indexed
   form also steps r1 to the next row. */
45 ldr r5, [r1, #4]
46 ldr r6, [r1, #8]
47 ldr r7, [r1, #12]
48 ldr r4, [r1], r2
/* Upper half is stored first; the lower-half store then advances r0. */
49 strd r6, r7, [r0, #8]
/* Second row, loads interleaved with the stores of the first row. */
50 ldr r9, [r1, #4]
51 strd r4, r5, [r0], r2
52 ldr r10, [r1, #8]
53 ldr r11, [r1, #12]
54 ldr r8, [r1], r2
55 strd r10, r11, [r0, #8]
/* The flags set here are consumed by the bne; strd leaves them intact. */
56 subs r3, r3, #2
57 strd r8, r9, [r0], r2
58 bne 1b
59
60 pop {r4-r11}
61 bx lr
62 .endfunc
63
/*
 * ff_put_pixels8_armv6
 *
 * Copies a block 8 bytes wide from r1 to r0, two rows per loop pass.
 *
 *   r0  destination pointer, advanced by r2 after every row
 *   r1  source pointer, advanced by r2 after every row
 *   r2  distance in bytes between rows, shared by source and destination
 *   r3  number of rows; must be a non-zero multiple of 2 because the
 *       loop subtracts 2 per pass and exits only on exactly zero
 *
 * The source is read with single-word ldr and the destination is written
 * with strd.  r4-r7 are saved and restored; no value is returned.
 */
64 function ff_put_pixels8_armv6, export=1
65 push {r4-r7}
66 1:
/* The high word is loaded first; the post-indexed low-word load then
   moves r1 on to the next row. */
67 ldr r5, [r1, #4]
68 ldr r4, [r1], r2
69 ldr r7, [r1, #4]
70 strd r4, r5, [r0], r2
71 ldr r6, [r1], r2
72 subs r3, r3, #2
73 strd r6, r7, [r0], r2
74 bne 1b
75
76 pop {r4-r7}
77 bx lr
78 .endfunc
79
/*
 * ff_put_pixels8_x2_armv6
 *
 * Horizontal half-sample interpolation with rounding on a block 8 bytes
 * wide: every output byte is (src[i] + src[i + 1] + 1) >> 1.  Two rows are
 * produced per loop pass.
 *
 *   r0  destination pointer, written with strd and advanced by r2 per row
 *   r1  source pointer; bytes 0..8 of each row are read
 *   r2  distance in bytes between rows, shared by source and destination
 *   r3  number of rows; must be a non-zero multiple of 2
 *
 * Words are loaded from both r1 + 4 and r1 + 5, so at least one of those
 * ldr accesses is not word aligned.
 *
 * Rounding: uhadd8 gives floor((a + b) / 2) per byte.  The low bit of
 * (a ^ b) is the bit that the halving dropped, so masking the eor with
 * 0x01010101 (kept in r12) and adding it with uadd8 rounds up.
 *
 * r14 (lr) is used as scratch, so lr is saved and the function returns by
 * popping it into pc.
 */
80 function ff_put_pixels8_x2_armv6, export=1
81 push {r4-r11, lr}
/* r12 = 0x01010101, the per-byte rounding mask. */
82 mov r12, #1
83 orr r12, r12, r12, lsl #8
84 orr r12, r12, r12, lsl #16
85 1:
/* Row A: r4 = bytes 0..3, r5 = bytes 4..7, r7 = bytes 5..8,
   r6 = bytes 1..4 assembled from r4 and r5. */
86 ldr r4, [r1]
87 subs r3, r3, #2
88 ldr r5, [r1, #4]
89 ldr r7, [r1, #5]
90 lsr r6, r4, #8
/* Row B: the pre-indexed load with writeback moves r1 to the next row.
   r8/r9/r11/r10 play the roles of r4/r5/r7/r6. */
91 ldr r8, [r1, r2]!
92 orr r6, r6, r5, lsl #24
93 ldr r9, [r1, #4]
94 ldr r11, [r1, #5]
95 lsr r10, r8, #8
/* r1 now points at the first row of the next pass. */
96 add r1, r1, r2
97 orr r10, r10, r9, lsl #24
/* Rounded averages for row A (r4, r5), then row B (r8, r9). */
98 eor r14, r4, r6
99 uhadd8 r4, r4, r6
100 eor r6, r5, r7
101 uhadd8 r5, r5, r7
102 and r14, r14, r12
103 and r6, r6, r12
104 uadd8 r4, r4, r14
105 eor r14, r8, r10
106 uadd8 r5, r5, r6
107 eor r6, r9, r11
108 uhadd8 r8, r8, r10
109 and r14, r14, r12
110 uhadd8 r9, r9, r11
111 and r6, r6, r12
112 uadd8 r8, r8, r14
113 strd r4, r5, [r0], r2
114 uadd8 r9, r9, r6
115 strd r8, r9, [r0], r2
/* Tests the flags from the subs at the top of the loop; nothing in
   between modifies the condition flags. */
116 bne 1b
117
118 pop {r4-r11, pc}
119 .endfunc
120
/*
 * ff_put_pixels8_y2_armv6
 *
 * Vertical half-sample interpolation with rounding on a block 8 bytes
 * wide: output row n is the per-byte (a + b + 1) >> 1 of source rows n
 * and n + 1.  Two output rows are produced per loop pass.
 *
 *   r0  destination pointer, written with strd and advanced by r2 per row
 *   r1  source pointer
 *   r2  distance in bytes between rows, shared by source and destination
 *   r3  number of output rows; must be a non-zero multiple of 2
 *
 * The loop is software pipelined: r4/r5 and r6/r7 alternate as "previous
 * row" and "current row", and the row for the next pass is loaded before
 * the branch.  NOTE(review): as a result the last pass still loads one
 * source row beyond the rows needed for the output (row r3 + 1 counted
 * from zero); its value is never used.
 *
 * r12 holds 0x01010101.  uhadd8 halves with truncation and the masked eor
 * adds back the dropped low bit, which gives the rounded average.
 * r4-r11 are saved and restored; no value is returned.
 */
121 function ff_put_pixels8_y2_armv6, export=1
122 push {r4-r11}
123 mov r12, #1
124 orr r12, r12, r12, lsl #8
125 orr r12, r12, r12, lsl #16
/* Prime the pipeline: r4/r5 = row 0, r6/r7 = row 1 (r1 moves to row 1). */
126 ldr r4, [r1]
127 ldr r5, [r1, #4]
128 ldr r6, [r1, r2]!
129 ldr r7, [r1, #4]
130 1:
131 subs r3, r3, #2
/* First output row: average of r4/r5 and r6/r7, result in r8/r9. */
132 uhadd8 r8, r4, r6
133 eor r10, r4, r6
134 uhadd8 r9, r5, r7
135 eor r11, r5, r7
136 and r10, r10, r12
/* r4/r5 are free now; reload them with the following source row. */
137 ldr r4, [r1, r2]!
138 uadd8 r8, r8, r10
139 and r11, r11, r12
140 uadd8 r9, r9, r11
141 ldr r5, [r1, #4]
/* Second output row: average of the new r4/r5 and r6/r7, in r10/r11.
   r6/r7 are reused as scratch for the rounding term. */
142 uhadd8 r10, r4, r6
143 eor r6, r4, r6
144 uhadd8 r11, r5, r7
145 and r6, r6, r12
146 eor r7, r5, r7
147 uadd8 r10, r10, r6
148 and r7, r7, r12
/* Load the row that the next pass will pair with r4/r5. */
149 ldr r6, [r1, r2]!
150 uadd8 r11, r11, r7
151 strd r8, r9, [r0], r2
152 ldr r7, [r1, #4]
153 strd r10, r11, [r0], r2
/* Uses the flags from the subs at the top of the loop. */
154 bne 1b
155
156 pop {r4-r11}
157 bx lr
158 .endfunc
159
/*
 * ff_put_pixels8_x2_no_rnd_armv6
 *
 * Horizontal half-sample interpolation without rounding on a block 8
 * bytes wide: every output byte is (src[i] + src[i + 1]) >> 1, which is
 * exactly what uhadd8 computes.  Two rows are produced per loop pass.
 *
 *   r0  destination pointer, written with stm and advanced by r2 per row
 *   r1  source pointer; bytes 0..8 of each row are read
 *   r2  distance in bytes between rows, shared by source and destination
 *   r3  number of rows; must be a non-zero multiple of 2
 *
 * Words are loaded from both r1 + 4 and r1 + 5, so at least one of those
 * ldr accesses is not word aligned.  r12 and r14 (lr) serve as scratch;
 * lr is saved and the function returns by popping it into pc.
 */
160 function ff_put_pixels8_x2_no_rnd_armv6, export=1
161 push {r4-r9, lr}
162 1:
163 subs r3, r3, #2
/* Row A: r4 = bytes 0..3, r5 = bytes 4..7, r7 = bytes 5..8. */
164 ldr r4, [r1]
165 ldr r5, [r1, #4]
166 ldr r7, [r1, #5]
/* Row B: the pre-indexed load with writeback moves r1 to the next row. */
167 ldr r8, [r1, r2]!
168 ldr r9, [r1, #4]
169 ldr r14, [r1, #5]
170 add r1, r1, r2
/* r6 and r12 = bytes 1..4 of rows A and B, assembled from two words. */
171 lsr r6, r4, #8
172 orr r6, r6, r5, lsl #24
173 lsr r12, r8, #8
174 orr r12, r12, r9, lsl #24
175 uhadd8 r4, r4, r6
176 uhadd8 r5, r5, r7
177 uhadd8 r8, r8, r12
178 uhadd8 r9, r9, r14
179 stm r0, {r4,r5}
180 add r0, r0, r2
181 stm r0, {r8,r9}
182 add r0, r0, r2
/* Uses the flags from the subs at the top; the add instructions above
   have no S suffix and leave them untouched. */
183 bne 1b
184
185 pop {r4-r9, pc}
186 .endfunc
187
/*
 * ff_put_pixels8_y2_no_rnd_armv6
 *
 * Vertical half-sample interpolation without rounding on a block 8 bytes
 * wide: output row n is the per-byte (a + b) >> 1 of source rows n and
 * n + 1 (uhadd8).  Two output rows are produced per loop pass.
 *
 *   r0  destination pointer, written with stm and advanced by r2 per row
 *   r1  source pointer
 *   r2  distance in bytes between rows, shared by source and destination
 *   r3  number of output rows; must be a non-zero multiple of 2
 *
 * r4/r5 and r6/r7 alternate between holding the older and the newer
 * source row.  NOTE(review): the last pass still loads one source row
 * beyond the rows needed for the output; its value is never used.
 *
 * r12 and r14 (lr) serve as scratch; lr is saved and the function
 * returns by popping it into pc.
 */
188 function ff_put_pixels8_y2_no_rnd_armv6, export=1
189 push {r4-r9, lr}
/* Prime the pipeline: r4/r5 = row 0, r6/r7 = row 1 (r1 moves to row 1). */
190 ldr r4, [r1]
191 ldr r5, [r1, #4]
192 ldr r6, [r1, r2]!
193 ldr r7, [r1, #4]
194 1:
195 subs r3, r3, #2
/* Each register is reloaded with a later row right after its last use. */
196 uhadd8 r8, r4, r6
197 ldr r4, [r1, r2]!
198 uhadd8 r9, r5, r7
199 ldr r5, [r1, #4]
200 uhadd8 r12, r4, r6
201 ldr r6, [r1, r2]!
202 uhadd8 r14, r5, r7
203 ldr r7, [r1, #4]
204 stm r0, {r8,r9}
205 add r0, r0, r2
206 stm r0, {r12,r14}
207 add r0, r0, r2
/* Uses the flags from the subs at the top of the loop. */
208 bne 1b
209
210 pop {r4-r9, pc}
211 .endfunc
212
/*
 * ff_avg_pixels8_armv6
 *
 * Replaces each destination byte of a block 8 bytes wide with the rounded
 * average of itself and the matching source byte:
 * dst = (dst + src + 1) >> 1.
 *
 *   r0  destination pointer, read with ldrd, written with strd, advanced
 *       by r2 per row
 *   r1  source pointer, read with single-word ldr, advanced by r2 per row
 *   r2  distance in bytes between rows, shared by source and destination
 *   r3  number of rows; it is reduced by 2 at a time and the loop ends
 *       only on exactly zero, so it must be a non-zero multiple of 2
 *
 * lr holds 0x01010101.  uhadd8 halves with truncation; the masked eor of
 * the two inputs is the dropped low bit, and uadd8 adds it back.
 *
 * The loop is software pipelined.  The data for the next row is loaded
 * while the current row is being finished, and the loop counter is
 * updated in the second half of the body, with the result tested by the
 * beq in the following pass.  Label 2 finishes the final row without
 * loading anything further.
 */
213 function ff_avg_pixels8_armv6, export=1
214 pld [r1, r2]
215 push {r4-r10, lr}
216 mov lr, #1
217 orr lr, lr, lr, lsl #8
218 orr lr, lr, lr, lsl #16
/* Prime: r4/r5 = destination row 0, r9/r10 = source row 0. */
219 ldrd r4, r5, [r0]
220 ldr r10, [r1, #4]
221 ldr r9, [r1], r2
/* Flags for the first beq below. */
222 subs r3, r3, #2
223 1:
224 pld [r1, r2]
/* Even row: average r4/r5 (destination) with r9/r10 (source). */
225 eor r8, r4, r9
226 uhadd8 r4, r4, r9
227 eor r12, r5, r10
/* Fetch the next destination row (r6/r7) and source row (r9/r10). */
228 ldrd r6, r7, [r0, r2]
229 uhadd8 r5, r5, r10
230 and r8, r8, lr
231 ldr r10, [r1, #4]
232 and r12, r12, lr
233 uadd8 r4, r4, r8
234 ldr r9, [r1], r2
/* Odd row: start averaging r6/r7 with the new r9/r10. */
235 eor r8, r6, r9
236 uadd8 r5, r5, r12
237 pld [r1, r2, lsl #1]
238 eor r12, r7, r10
239 uhadd8 r6, r6, r9
240 strd r4, r5, [r0], r2
241 uhadd8 r7, r7, r10
/* Zero rows left after this pair: finish the odd row at label 2. */
242 beq 2f
/* More rows follow: finish the odd row while loading the next pair. */
243 and r8, r8, lr
244 ldrd r4, r5, [r0, r2]
245 uadd8 r6, r6, r8
246 ldr r10, [r1, #4]
247 and r12, r12, lr
248 subs r3, r3, #2
249 uadd8 r7, r7, r12
250 ldr r9, [r1], r2
251 strd r6, r7, [r0], r2
252 b 1b
253 2:
254 and r8, r8, lr
255 and r12, r12, lr
256 uadd8 r6, r6, r8
257 uadd8 r7, r7, r12
258 strd r6, r7, [r0], r2
259
260 pop {r4-r10, pc}
261 .endfunc
262
/*
 * ff_add_pixels_clamped_armv6
 *
 * Adds eight rows of eight 16-bit values to eight rows of eight byte
 * pixels, saturates every sum to 0..255 and writes the result back over
 * the pixels.
 *
 *   r0  pointer to the 16-bit values, read contiguously, 16 bytes per
 *       row (ldm with writeback)
 *   r1  pointer to the byte pixels, read with ldrd and written with strd
 *   r2  distance in bytes between pixel rows
 *
 * The row count is fixed at 8 (r3 is overwritten on entry).
 * r12 and lr serve as scratch; lr is saved and the function returns by
 * popping it into pc.
 */
263 function ff_add_pixels_clamped_armv6, export=1
264 push {r4-r8,lr}
265 mov r3, #8
266 1:
/* r4, r5, r12, lr = values 0/1, 2/3, 4/5, 6/7 as halfword pairs. */
267 ldm r0!, {r4,r5,r12,lr}
/* r6 = pixels 0..3, r7 = pixels 4..7. */
268 ldrd r6, r7, [r1]
/* Regroup into even/odd pairs so they line up with uxtab16, which adds
   bytes 0 and 2 of its (optionally rotated) last operand:
   r8 = values 0,2   r5 = values 1,3   r4 = values 4,6   lr = values 5,7 */
269 pkhbt r8, r4, r5, lsl #16
270 pkhtb r5, r5, r4, asr #16
271 pkhbt r4, r12, lr, lsl #16
272 pkhtb lr, lr, r12, asr #16
273 pld [r1, r2]
/* Add the zero-extended pixels; ror #8 selects the odd-numbered bytes. */
274 uxtab16 r8, r8, r6
275 uxtab16 r5, r5, r6, ror #8
276 uxtab16 r4, r4, r7
277 uxtab16 lr, lr, r7, ror #8
/* Saturate each signed 16-bit sum to the unsigned 8-bit range. */
278 usat16 r8, #8, r8
279 usat16 r5, #8, r5
280 usat16 r4, #8, r4
281 usat16 lr, #8, lr
/* Interleave the even and odd results back into byte order. */
282 orr r6, r8, r5, lsl #8
283 orr r7, r4, lr, lsl #8
284 subs r3, r3, #1
285 strd r6, r7, [r1], r2
286 bgt 1b
287 pop {r4-r8,pc}
288 .endfunc
289
/*
 * ff_get_pixels_armv6
 *
 * Reads eight rows of eight byte pixels and stores them, zero-extended
 * to 16 bits and in the same order, as a contiguous array.
 *
 *   r0  output pointer; 16 bytes are written per row (stm with writeback)
 *   r1  source pixel pointer, read with ldrd and advanced by r2 per row
 *   r2  distance in bytes between source rows
 *
 * The row count is fixed at 8 and kept in lr, so lr is saved and the
 * function returns by popping it into pc.  r12 is used as scratch.
 */
290 function ff_get_pixels_armv6, export=1
291 pld [r1, r2]
292 push {r4-r8, lr}
293 mov lr, #8
294 1:
295 ldrd r4, r5, [r1], r2
296 subs lr, lr, #1
/* uxtb16 extracts bytes 0 and 2; with ror #8 it extracts bytes 1 and 3.
   r6 = pixels 0,2   r4 = pixels 1,3   r12 = pixels 4,6   r8 = pixels 5,7 */
297 uxtb16 r6, r4
298 uxtb16 r4, r4, ror #8
299 uxtb16 r12, r5
300 uxtb16 r8, r5, ror #8
301 pld [r1, r2]
/* Re-pair into consecutive order:
   r5 = pixels 0,1   r6 = pixels 2,3   r7 = pixels 4,5   r12 = pixels 6,7 */
302 pkhbt r5, r6, r4, lsl #16
303 pkhtb r6, r4, r6, asr #16
304 pkhbt r7, r12, r8, lsl #16
305 pkhtb r12, r8, r12, asr #16
306 stm r0!, {r5,r6,r7,r12}
/* Uses the flags from the subs above; none of the instructions in
   between modify the condition flags. */
307 bgt 1b
308
309 pop {r4-r8, pc}
310 .endfunc
311
/*
 * ff_diff_pixels_armv6
 *
 * Computes the per-pixel difference (pixel from r1 minus pixel from r2)
 * of two blocks of eight rows by eight bytes and stores the results as a
 * contiguous array of 16-bit values.
 *
 *   r0  output pointer; 16 bytes are written per row (stm with writeback)
 *   r1  first pixel pointer (minuend), read with ldrd
 *   r2  second pixel pointer (subtrahend), read with ldrd
 *   r3  distance in bytes between rows, shared by both inputs
 *
 * The row count is fixed at 8 and kept in lr, so lr is saved and the
 * function returns by popping it into pc.
 */
312 function ff_diff_pixels_armv6, export=1
313 pld [r1, r3]
314 pld [r2, r3]
315 push {r4-r9, lr}
316 mov lr, #8
317 1:
318 ldrd r4, r5, [r1], r3
319 ldrd r6, r7, [r2], r3
/* Pixels 0..3: split both inputs into even (bytes 0,2) and odd
   (bytes 1,3) halfword pairs and subtract. */
320 uxtb16 r8, r4
321 uxtb16 r4, r4, ror #8
322 uxtb16 r9, r6
323 uxtb16 r6, r6, ror #8
324 pld [r1, r3]
/* r9 = differences 0,2   r6 = differences 1,3 */
325 ssub16 r9, r8, r9
326 ssub16 r6, r4, r6
/* Start on pixels 4..7 while pixels 0..3 are being re-paired. */
327 uxtb16 r8, r5
328 uxtb16 r5, r5, ror #8
329 pld [r2, r3]
/* r4 = differences 0,1   r6 = differences 2,3 */
330 pkhbt r4, r9, r6, lsl #16
331 pkhtb r6, r6, r9, asr #16
332 uxtb16 r9, r7
333 uxtb16 r7, r7, ror #8
/* r9 = differences 4,6   r5 = differences 5,7 */
334 ssub16 r9, r8, r9
335 ssub16 r5, r5, r7
336 subs lr, lr, #1
/* r8 = differences 4,5   r9 = differences 6,7 */
337 pkhbt r8, r9, r5, lsl #16
338 pkhtb r9, r5, r9, asr #16
339 stm r0!, {r4,r6,r8,r9}
340 bgt 1b
341
342 pop {r4-r9, pc}
343 .endfunc
344
/*
 * ff_pix_abs16_armv6
 *
 * Sum of absolute differences between two blocks 16 bytes wide.
 *
 *   r0    ignored on entry; it is overwritten with the row count
 *   r1    first block, read with ldm (four words per row)
 *   r2    second block, read with single-word ldr
 *   r3    distance in bytes between rows, shared by both blocks
 *   [sp]  first stack argument: number of rows.  The loop subtracts 1 per
 *         row and ends only on exactly zero, so it must be at least 1.
 *
 * Returns the sum in r0.  Two accumulators (r12 and lr) are used
 * alternately and added together at the end.
 */
345 function ff_pix_abs16_armv6, export=1
/* Fetch the row count before the push moves sp. */
346 ldr r0, [sp]
347 push {r4-r9, lr}
348 mov r12, #0
349 mov lr, #0
/* Prime: the whole first row of r1 and the first word of r2. */
350 ldm r1, {r4-r7}
351 ldr r8, [r2]
352 1:
/* usada8 adds the four absolute byte differences of its first two
   operands to the accumulator.  r8 and r9 are reloaded with the
   remaining words of the r2 row as soon as they have been consumed. */
353 ldr r9, [r2, #4]
354 pld [r1, r3]
355 usada8 r12, r4, r8, r12
356 ldr r8, [r2, #8]
357 pld [r2, r3]
358 usada8 lr, r5, r9, lr
359 ldr r9, [r2, #12]
360 usada8 r12, r6, r8, r12
361 subs r0, r0, #1
362 usada8 lr, r7, r9, lr
/* Last row done: do not load past the end of the blocks. */
363 beq 2f
364 add r1, r1, r3
365 ldm r1, {r4-r7}
366 add r2, r2, r3
367 ldr r8, [r2]
368 b 1b
369 2:
370 add r0, r12, lr
371 pop {r4-r9, pc}
372 .endfunc
373
/*
 * ff_pix_abs16_x2_armv6
 *
 * Sum of absolute differences between a block 16 bytes wide at r1 and
 * the horizontally interpolated block at r2, where each interpolated
 * byte is (p[i] + p[i + 1] + 1) >> 1.
 *
 *   r0    ignored on entry; used as the accumulator and return value
 *   r1    first block, read with single-word ldr
 *   r2    second block; bytes 0..16 of each row are read (four words
 *         plus one byte)
 *   r3    distance in bytes between rows, shared by both blocks
 *   [sp]  first stack argument: number of rows, kept in r12.  One row is
 *         processed per pass and the loop repeats while the count stays
 *         above zero.
 *
 * lr holds 0x01010101.  uhadd8 halves with truncation and the masked eor
 * of the two inputs adds back the dropped low bit, which gives the
 * rounded average.  The shifted-by-one-byte operand for each word is
 * built from that word (lsr #8) and the following word or byte
 * (lsl #24).
 */
374 function ff_pix_abs16_x2_armv6, export=1
/* Fetch the row count before the push moves sp. */
375 ldr r12, [sp]
376 push {r4-r11, lr}
377 mov r0, #0
378 mov lr, #1
379 orr lr, lr, lr, lsl #8
380 orr lr, lr, lr, lsl #16
381 1:
/* Word 0: r8 = bytes 0..3, r10 = bytes 1..4, r7 = rounded average. */
382 ldr r8, [r2]
383 ldr r9, [r2, #4]
384 lsr r10, r8, #8
385 ldr r4, [r1]
386 lsr r6, r9, #8
387 orr r10, r10, r9, lsl #24
388 ldr r5, [r2, #8]
389 eor r11, r8, r10
390 uhadd8 r7, r8, r10
391 orr r6, r6, r5, lsl #24
392 and r11, r11, lr
393 uadd8 r7, r7, r11
394 ldr r8, [r1, #4]
395 usada8 r0, r4, r7, r0
/* Word 1: r9 = bytes 4..7, r6 = bytes 5..8, r4 = rounded average. */
396 eor r7, r9, r6
397 lsr r10, r5, #8
398 and r7, r7, lr
399 uhadd8 r4, r9, r6
400 ldr r6, [r2, #12]
401 uadd8 r4, r4, r7
402 pld [r1, r3]
403 orr r10, r10, r6, lsl #24
404 usada8 r0, r8, r4, r0
/* Word 2: r5 = bytes 8..11, r10 = bytes 9..12, r8 = rounded average. */
405 ldr r4, [r1, #8]
406 eor r11, r5, r10
/* Byte 16 supplies the right-hand neighbour of byte 15. */
407 ldrb r7, [r2, #16]
408 and r11, r11, lr
409 uhadd8 r8, r5, r10
410 ldr r5, [r1, #12]
411 uadd8 r8, r8, r11
412 pld [r2, r3]
413 lsr r10, r6, #8
414 usada8 r0, r4, r8, r0
/* Word 3: r6 = bytes 12..15, r10 = bytes 13..16, r9 = rounded average. */
415 orr r10, r10, r7, lsl #24
416 subs r12, r12, #1
417 eor r11, r6, r10
418 add r1, r1, r3
419 uhadd8 r9, r6, r10
420 and r11, r11, lr
421 uadd8 r9, r9, r11
422 add r2, r2, r3
423 usada8 r0, r5, r9, r0
/* Uses the flags from the subs above; the add instructions in between
   have no S suffix and leave them untouched. */
424 bgt 1b
425
426 pop {r4-r11, pc}
427 .endfunc
428
/*
 * usad_y2 p0, p1, p2, p3, n0, n1, n2, n3
 *
 * Processes one 16-byte row for the vertically interpolated SAD.
 *
 *   p0-p3  on entry: the four words of the previous row read from r2.
 *          They are overwritten and hold no useful data on exit.
 *   n0-n3  on exit: the four words of the row just read from r2, ready
 *          to be passed as p0-p3 to the next expansion.  n1-n3 are also
 *          used as scratch along the way.
 *
 * For each word, the rounded per-byte average of the previous and the
 * current r2 row is formed (uhadd8, plus the masked low bit of the eor)
 * and its absolute byte differences against the matching word at r1 are
 * added to r0.
 *
 * Expects lr = 0x01010101 and r0 = running sum.  Advances r1 and r2 by
 * r3.  Does not modify the condition flags.
 */
429 .macro usad_y2 p0, p1, p2, p3, n0, n1, n2, n3
/* Word 0: n1 is scratch for the rounding bit, n2 holds the r1 word. */
430 ldr \n0, [r2]
431 eor \n1, \p0, \n0
432 uhadd8 \p0, \p0, \n0
433 and \n1, \n1, lr
434 ldr \n2, [r1]
435 uadd8 \p0, \p0, \n1
/* Word 1: n1 now receives its final value; n3 is scratch and p0, no
   longer needed after the usada8, holds the r1 word. */
436 ldr \n1, [r2, #4]
437 usada8 r0, \p0, \n2, r0
438 pld [r1, r3]
439 eor \n3, \p1, \n1
440 uhadd8 \p1, \p1, \n1
441 and \n3, \n3, lr
442 ldr \p0, [r1, #4]
443 uadd8 \p1, \p1, \n3
/* Word 2: n2 receives its final value; p0 is scratch, p1 the r1 word. */
444 ldr \n2, [r2, #8]
445 usada8 r0, \p1, \p0, r0
446 pld [r2, r3]
447 eor \p0, \p2, \n2
448 uhadd8 \p2, \p2, \n2
449 and \p0, \p0, lr
450 ldr \p1, [r1, #8]
451 uadd8 \p2, \p2, \p0
/* Word 3: n3 receives its final value; p1 is scratch, p0 the r1 word. */
452 ldr \n3, [r2, #12]
453 usada8 r0, \p2, \p1, r0
454 eor \p1, \p3, \n3
455 uhadd8 \p3, \p3, \n3
456 and \p1, \p1, lr
457 ldr \p0, [r1, #12]
458 uadd8 \p3, \p3, \p1
459 add r1, r1, r3
460 usada8 r0, \p3, \p0, r0
461 add r2, r2, r3
462 .endm
463
/*
 * ff_pix_abs16_y2_armv6
 *
 * Sum of absolute differences between a block 16 bytes wide at r1 and
 * the vertically interpolated block at r2, where each interpolated byte
 * is the rounded average of the byte in the current row and the byte in
 * the row below.
 *
 *   r0    ignored on entry; used as the accumulator and return value
 *   r1    first block
 *   r2    second block; one row more than the row count is read
 *   r3    distance in bytes between rows, shared by both blocks
 *   [sp]  first stack argument: number of rows, kept in r12.  Two rows
 *         are processed per pass and the loop repeats while the count
 *         stays above zero.
 *
 * The two usad_y2 expansions swap the roles of r4-r7 and r8-r11, so the
 * row loaded by one expansion serves as the previous row of the next
 * without any register moves.
 */
464 function ff_pix_abs16_y2_armv6, export=1
465 pld [r1]
466 pld [r2]
/* Fetch the row count before the push moves sp. */
467 ldr r12, [sp]
468 push {r4-r11, lr}
469 mov r0, #0
/* lr = 0x01010101, the rounding mask expected by usad_y2. */
470 mov lr, #1
471 orr lr, lr, lr, lsl #8
472 orr lr, lr, lr, lsl #16
/* Prime r4-r7 with row 0 of r2 and move r2 to row 1. */
473 ldr r4, [r2]
474 ldr r5, [r2, #4]
475 ldr r6, [r2, #8]
476 ldr r7, [r2, #12]
477 add r2, r2, r3
478 1:
479 usad_y2 r4, r5, r6, r7, r8, r9, r10, r11
/* The second expansion does not modify the flags set here. */
480 subs r12, r12, #2
481 usad_y2 r8, r9, r10, r11, r4, r5, r6, r7
482 bgt 1b
483
484 pop {r4-r11, pc}
485 .endfunc
486
/*
 * ff_pix_abs8_armv6
 *
 * Sum of absolute differences between two blocks 8 bytes wide.
 *
 *   r0    ignored on entry; used as an accumulator and return value
 *   r1    first block, read with ldrd and advanced by r3 per row
 *   r2    second block, read with single-word ldr, advanced by r3 per row
 *   r3    distance in bytes between rows, shared by both blocks
 *   [sp]  first stack argument: number of rows, kept in r12.  It is
 *         reduced by 2 per pass and the loop ends only on exactly zero,
 *         so it must be a non-zero multiple of 2.
 *
 * Two accumulators are used: r0 for the low word of each row and lr for
 * the high word; they are added together at the end.  The loop is
 * software pipelined and label 2 finishes the last row without loading
 * anything further.
 */
487 function ff_pix_abs8_armv6, export=1
488 pld [r2, r3]
/* Fetch the row count before the push moves sp. */
489 ldr r12, [sp]
490 push {r4-r9, lr}
491 mov r0, #0
492 mov lr, #0
/* Prime: r4/r5 = first row of r1. */
493 ldrd r4, r5, [r1], r3
494 1:
495 subs r12, r12, #2
/* Even row of r2 into r6/r7, odd row of r1 into r8/r9. */
496 ldr r7, [r2, #4]
497 ldr r6, [r2], r3
498 ldrd r8, r9, [r1], r3
499 usada8 r0, r4, r6, r0
500 pld [r2, r3]
501 usada8 lr, r5, r7, lr
/* Odd row of r2 into r6/r7. */
502 ldr r7, [r2, #4]
503 ldr r6, [r2], r3
/* Uses the flags from the subs at the top of the loop. */
504 beq 2f
/* More rows follow: load the next even row of r1 while the odd row is
   accumulated. */
505 ldrd r4, r5, [r1], r3
506 usada8 r0, r8, r6, r0
507 pld [r2, r3]
508 usada8 lr, r9, r7, lr
509 b 1b
510 2:
511 usada8 r0, r8, r6, r0
512 usada8 lr, r9, r7, lr
513 add r0, r0, lr
514 pop {r4-r9, pc}
515 .endfunc
516
/*
 * ff_sse16_armv6
 *
 * Sum of squared differences between two blocks 16 bytes wide.
 *
 *   r0    ignored on entry; used as the accumulator and return value
 *   r1    first block, read with ldrd
 *   r2    second block, read with single-word ldr
 *   r3    distance in bytes between rows, shared by both blocks
 *   [sp]  first stack argument: number of rows, kept in r12.  One row is
 *         processed per pass and the loop repeats while the count stays
 *         above zero.
 *
 * Each word is split with uxtb16 into its even bytes (0,2) and its odd
 * bytes (1,3, via ror #8) as halfword pairs.  usub16 subtracts the pairs
 * and "smlad r0, x, x, r0" adds the squares of both halfwords of x to r0,
 * treating each halfword as a signed 16-bit value.
 */
517 function ff_sse16_armv6, export=1
/* Fetch the row count before the push moves sp. */
518 ldr r12, [sp]
519 push {r4-r9, lr}
520 mov r0, #0
521 1:
/* Bytes 0..3: differences in lr (even) and r4 (odd). */
522 ldrd r4, r5, [r1]
523 ldr r8, [r2]
524 uxtb16 lr, r4
525 uxtb16 r4, r4, ror #8
526 uxtb16 r9, r8
527 uxtb16 r8, r8, ror #8
528 ldr r7, [r2, #4]
529 usub16 lr, lr, r9
530 usub16 r4, r4, r8
531 smlad r0, lr, lr, r0
/* Bytes 4..7: differences in r6 (even) and r8 (odd). */
532 uxtb16 r6, r5
533 uxtb16 lr, r5, ror #8
534 uxtb16 r8, r7
535 uxtb16 r9, r7, ror #8
536 smlad r0, r4, r4, r0
537 ldrd r4, r5, [r1, #8]
538 usub16 r6, r6, r8
539 usub16 r8, lr, r9
540 ldr r7, [r2, #8]
541 smlad r0, r6, r6, r0
/* Bytes 8..11: differences in lr (even) and r4 (odd). */
542 uxtb16 lr, r4
543 uxtb16 r4, r4, ror #8
544 uxtb16 r9, r7
545 uxtb16 r7, r7, ror #8
546 smlad r0, r8, r8, r0
547 ldr r8, [r2, #12]
548 usub16 lr, lr, r9
549 usub16 r4, r4, r7
550 smlad r0, lr, lr, r0
/* Bytes 12..15: differences in r6 (even) and r5 (odd). */
551 uxtb16 r6, r5
552 uxtb16 r5, r5, ror #8
553 uxtb16 r9, r8
554 uxtb16 r8, r8, ror #8
555 smlad r0, r4, r4, r0
556 usub16 r6, r6, r9
557 usub16 r5, r5, r8
558 smlad r0, r6, r6, r0
559 add r1, r1, r3
560 add r2, r2, r3
561 subs r12, r12, #1
/* smlad has no flag-setting side effect on N/Z/C/V, so the bgt still
   tests the result of the subs. */
562 smlad r0, r5, r5, r0
563 bgt 1b
564
565 pop {r4-r9, pc}
566 .endfunc
567
/*
 * ff_pix_norm1_armv6
 *
 * Sum of the squares of all bytes in a block of 16 rows by 16 bytes.
 *
 *   r0  block pointer, read with ldm (four words per row) and advanced
 *       by r1 per row
 *   r1  distance in bytes between rows
 *
 * The row count is fixed at 16 (r12).  Returns the sum in r0.
 *
 * Each word is split with uxtb16 into its even bytes (0,2) and its odd
 * bytes (1,3, via ror #8) as halfword pairs; "smlad lr, x, x, lr" then
 * adds the squares of both halfwords of x to lr.
 */
568 function ff_pix_norm1_armv6, export=1
569 push {r4-r6, lr}
570 mov r12, #16
571 mov lr, #0
572 1:
573 ldm r0, {r2-r5}
/* r6 is the scratch register for the even bytes of each word; the odd
   bytes replace the word in place. */
574 uxtb16 r6, r2
575 uxtb16 r2, r2, ror #8
576 smlad lr, r6, r6, lr
577 uxtb16 r6, r3
578 smlad lr, r2, r2, lr
579 uxtb16 r3, r3, ror #8
580 smlad lr, r6, r6, lr
581 uxtb16 r6, r4
582 smlad lr, r3, r3, lr
583 uxtb16 r4, r4, ror #8
584 smlad lr, r6, r6, lr
585 uxtb16 r6, r5
586 smlad lr, r4, r4, lr
587 uxtb16 r5, r5, ror #8
588 smlad lr, r6, r6, lr
589 subs r12, r12, #1
590 add r0, r0, r1
591 smlad lr, r5, r5, lr
592 bgt 1b
593
594 mov r0, lr
595 pop {r4-r6, pc}
596 .endfunc
597
/*
 * ff_pix_sum_armv6
 *
 * Sum of all bytes in a block of 16 rows by 16 bytes.
 *
 *   r0  block pointer, read with single-word ldr and advanced by r1 per
 *       row
 *   r1  distance in bytes between rows
 *
 * The row count is fixed at 16 (r12).  Returns the sum in r0.
 *
 * lr is held at zero, so "usada8 acc, x, lr, acc" adds the four bytes of
 * x to acc.  Two accumulators (r2 and r3) are used alternately and added
 * together at the end.
 */
598 function ff_pix_sum_armv6, export=1
599 push {r4-r7, lr}
600 mov r12, #16
601 mov r2, #0
602 mov r3, #0
603 mov lr, #0
/* Prime: first word of the first row. */
604 ldr r4, [r0]
605 1:
606 subs r12, r12, #1
607 ldr r5, [r0, #4]
608 usada8 r2, r4, lr, r2
609 ldr r6, [r0, #8]
610 usada8 r3, r5, lr, r3
611 ldr r7, [r0, #12]
612 usada8 r2, r6, lr, r2
/* Last row: skip the load of the next row and finish at label 2. */
613 beq 2f
/* Move to the next row and fetch its first word before accumulating the
   last word of this row. */
614 ldr r4, [r0, r1]!
615 usada8 r3, r7, lr, r3
/* Still tests the subs result, which is positive whenever the beq above
   was not taken. */
616 bgt 1b
617 2:
618 usada8 r3, r7, lr, r3
619 add r0, r2, r3
620 pop {r4-r7, pc}
621 .endfunc