ARM: change alignment of loops in put_pixels*_arm to 32
[libav.git] / libavcodec / arm / dsputil_arm_s.S
1 @
2 @ ARMv4 optimized DSP utils
3 @ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
4 @
5 @ This file is part of FFmpeg.
6 @
7 @ FFmpeg is free software; you can redistribute it and/or
8 @ modify it under the terms of the GNU Lesser General Public
9 @ License as published by the Free Software Foundation; either
10 @ version 2.1 of the License, or (at your option) any later version.
11 @
12 @ FFmpeg is distributed in the hope that it will be useful,
13 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
14 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 @ Lesser General Public License for more details.
16 @
17 @ You should have received a copy of the GNU Lesser General Public
18 @ License along with FFmpeg; if not, write to the Free Software
19 @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 @
21
22 #include "config.h"
23 #include "asm.S"
24
25 preserve8
26
@ Fallback for targets whose instruction set lacks PLD: define an empty macro
@ named pld so that every prefetch hint in this file assembles to nothing.
#if !HAVE_PLD
        .macro  pld reg
        .endm
#endif
31
#if HAVE_ARMV5TE
@ ff_prefetch_arm
@ Issues one pld hint per iteration, stepping the address each time.
@ In:  r0 = address to prefetch
@      r1 = value added to r0 after every hint
@      r2 = number of hints; must be nonzero because the counter is
@           decremented before it is tested
@ r0, r2 and the flags are modified. The loop branches back to the entry point.
function ff_prefetch_arm, export=1
        pld     [r0]                    @ hint the current address
        subs    r2, r2, #1              @ one fewer hint left, sets Z on the last
        add     r0, r0, r1              @ step to the next address, flags untouched
        bne     ff_prefetch_arm
        bx      lr
        .endfunc
#endif
41
@ ADJ_ALIGN_QUADWORD_D shift, Rd0-Rd3, Rn0-Rn4
@ Builds four words from five source words, skipping the first \shift bytes:
@   Rd[i] = (Rn[i] >> 8*shift) | (Rn[i+1] << (32 - 8*shift))    for i = 0..3
@ All four right shifts are issued before the four merges, so the Rd
@ registers must not overlap Rn1-Rn4. Every use in this file passes shift 1..3.
.macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
        @ low part of each result: drop the leading bytes
        mov     \Rd0, \Rn0, lsr #(8 * \shift)
        mov     \Rd1, \Rn1, lsr #(8 * \shift)
        mov     \Rd2, \Rn2, lsr #(8 * \shift)
        mov     \Rd3, \Rn3, lsr #(8 * \shift)
        @ high part of each result: bytes borrowed from the following word
        orr     \Rd0, \Rd0, \Rn1, lsl #(32 - 8 * \shift)
        orr     \Rd1, \Rd1, \Rn2, lsl #(32 - 8 * \shift)
        orr     \Rd2, \Rd2, \Rn3, lsl #(32 - 8 * \shift)
        orr     \Rd3, \Rd3, \Rn4, lsl #(32 - 8 * \shift)
.endm
@ ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2
@ In-place realignment of two words using a third as the tail:
@   R0 = (R0 >> 8*shift) | (R1 << (32 - 8*shift))
@   R1 = (R1 >> 8*shift) | (R2 << (32 - 8*shift))
@ R2 is only read. R0 is completed before R1 is shifted, because the merge
@ into R0 needs the original value of R1.
.macro ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2
        mov     \R0, \R0, lsr #(8 * \shift)
        orr     \R0, \R0, \R1, lsl #(32 - 8 * \shift)
        mov     \R1, \R1, lsr #(8 * \shift)
        orr     \R1, \R1, \R2, lsl #(32 - 8 * \shift)
.endm
@ ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
@ Realignment of two words into separate destination registers:
@   Rdst0 = (Rsrc0 >> 8*shift) | (Rsrc1 << (32 - 8*shift))
@   Rdst1 = (Rsrc1 >> 8*shift) | (Rsrc2 << (32 - 8*shift))
@ Both right shifts come first and Rsrc0 is read by the very first
@ instruction, so Rdst1 may be the same register as Rsrc0 (one caller in
@ this file relies on that). The destinations must not overlap Rsrc1 or Rsrc2.
.macro ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
        mov     \Rdst0, \Rsrc0, lsr #(8 * \shift)
        mov     \Rdst1, \Rsrc1, lsr #(8 * \shift)
        orr     \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - 8 * \shift)
        orr     \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - 8 * \shift)
.endm
64
@ RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
@ Per-byte average of two word pairs, rounding halves up:
@   Rd = (Rn | Rm) - (((Rn ^ Rm) & 0xFEFEFEFE) >> 1)
@ Rmask must hold 0xFEFEFEFE; masking before the shift stops the low bit of
@ one byte from leaking into the byte below it.
@ Rn0 and Rn1 are destroyed (they receive Rn | Rm); Rm0 and Rm1 are preserved.
@ The Rd registers must be distinct from Rn, Rm and Rmask.
.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
        @ differing bits, with the lowest bit of every byte cleared
        eor     \Rd0, \Rn0, \Rm0
        eor     \Rd1, \Rn1, \Rm1
        and     \Rd0, \Rd0, \Rmask
        and     \Rd1, \Rd1, \Rmask
        @ union of the set bits
        orr     \Rn0, \Rn0, \Rm0
        orr     \Rn1, \Rn1, \Rm1
        @ union minus half of the difference
        sub     \Rd0, \Rn0, \Rd0, lsr #1
        sub     \Rd1, \Rn1, \Rd1, lsr #1
.endm
78
@ NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
@ Per-byte average of two word pairs, rounding halves down:
@   Rd = (Rn & Rm) + (((Rn ^ Rm) & 0xFEFEFEFE) >> 1)
@ Rmask must hold 0xFEFEFEFE; masking before the shift stops the low bit of
@ one byte from leaking into the byte below it.
@ Rn0 and Rn1 are destroyed (they receive Rn & Rm); Rm0 and Rm1 are preserved.
@ The Rd registers must be distinct from Rn, Rm and Rmask.
.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
        @ differing bits, with the lowest bit of every byte cleared
        eor     \Rd0, \Rn0, \Rm0
        eor     \Rd1, \Rn1, \Rm1
        and     \Rd0, \Rd0, \Rmask
        and     \Rd1, \Rd1, \Rmask
        @ bits common to both inputs
        and     \Rn0, \Rn0, \Rm0
        and     \Rn1, \Rn1, \Rm1
        @ common bits plus half of the difference
        add     \Rd0, \Rn0, \Rd0, lsr #1
        add     \Rd1, \Rn1, \Rd1, lsr #1
.endm
92
93 @ ----------------------------------------------------------------
94 .align 5
95 function put_pixels16_arm, export=1
96 @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
97 @ block = word aligned, pixles = unaligned
98 pld [r1]
99 stmfd sp!, {r4-r11, lr} @ R14 is also called LR
100 adr r5, 5f
101 ands r4, r1, #3
102 bic r1, r1, #3
103 add r5, r5, r4, lsl #2
104 ldrne pc, [r5]
105 1:
106 ldmia r1, {r4-r7}
107 add r1, r1, r2
108 stmia r0, {r4-r7}
109 pld [r1]
110 subs r3, r3, #1
111 add r0, r0, r2
112 bne 1b
113 ldmfd sp!, {r4-r11, pc}
114 .align 5
115 2:
116 ldmia r1, {r4-r8}
117 add r1, r1, r2
118 ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
119 pld [r1]
120 subs r3, r3, #1
121 stmia r0, {r9-r12}
122 add r0, r0, r2
123 bne 2b
124 ldmfd sp!, {r4-r11, pc}
125 .align 5
126 3:
127 ldmia r1, {r4-r8}
128 add r1, r1, r2
129 ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
130 pld [r1]
131 subs r3, r3, #1
132 stmia r0, {r9-r12}
133 add r0, r0, r2
134 bne 3b
135 ldmfd sp!, {r4-r11, pc}
136 .align 5
137 4:
138 ldmia r1, {r4-r8}
139 add r1, r1, r2
140 ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
141 pld [r1]
142 subs r3, r3, #1
143 stmia r0, {r9-r12}
144 add r0, r0, r2
145 bne 4b
146 ldmfd sp!, {r4-r11,pc}
147 5:
148 .word 1b
149 .word 2b
150 .word 3b
151 .word 4b
152 .endfunc
153
154 @ ----------------------------------------------------------------
155 .align 5
156 function put_pixels8_arm, export=1
157 @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
158 @ block = word aligned, pixles = unaligned
159 pld [r1]
160 stmfd sp!, {r4-r5,lr} @ R14 is also called LR
161 adr r5, 5f
162 ands r4, r1, #3
163 bic r1, r1, #3
164 add r5, r5, r4, lsl #2
165 ldrne pc, [r5]
166 1:
167 ldmia r1, {r4-r5}
168 add r1, r1, r2
169 subs r3, r3, #1
170 pld [r1]
171 stmia r0, {r4-r5}
172 add r0, r0, r2
173 bne 1b
174 ldmfd sp!, {r4-r5,pc}
175 .align 5
176 2:
177 ldmia r1, {r4-r5, r12}
178 add r1, r1, r2
179 ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12
180 pld [r1]
181 subs r3, r3, #1
182 stmia r0, {r4-r5}
183 add r0, r0, r2
184 bne 2b
185 ldmfd sp!, {r4-r5,pc}
186 .align 5
187 3:
188 ldmia r1, {r4-r5, r12}
189 add r1, r1, r2
190 ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12
191 pld [r1]
192 subs r3, r3, #1
193 stmia r0, {r4-r5}
194 add r0, r0, r2
195 bne 3b
196 ldmfd sp!, {r4-r5,pc}
197 .align 5
198 4:
199 ldmia r1, {r4-r5, r12}
200 add r1, r1, r2
201 ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12
202 pld [r1]
203 subs r3, r3, #1
204 stmia r0, {r4-r5}
205 add r0, r0, r2
206 bne 4b
207 ldmfd sp!, {r4-r5,pc}
208 5:
209 .word 1b
210 .word 2b
211 .word 3b
212 .word 4b
213 .endfunc
214
215 @ ----------------------------------------------------------------
216 .align 5
217 function put_pixels8_x2_arm, export=1
218 @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
219 @ block = word aligned, pixles = unaligned
220 pld [r1]
221 stmfd sp!, {r4-r10,lr} @ R14 is also called LR
222 adr r5, 5f
223 ands r4, r1, #3
224 ldr r12, [r5]
225 add r5, r5, r4, lsl #2
226 bic r1, r1, #3
227 ldrne pc, [r5]
228 1:
229 ldmia r1, {r4-r5, r10}
230 add r1, r1, r2
231 ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
232 pld [r1]
233 RND_AVG32 r8, r9, r4, r5, r6, r7, r12
234 subs r3, r3, #1
235 stmia r0, {r8-r9}
236 add r0, r0, r2
237 bne 1b
238 ldmfd sp!, {r4-r10,pc}
239 .align 5
240 2:
241 ldmia r1, {r4-r5, r10}
242 add r1, r1, r2
243 ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
244 ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
245 pld [r1]
246 RND_AVG32 r4, r5, r6, r7, r8, r9, r12
247 subs r3, r3, #1
248 stmia r0, {r4-r5}
249 add r0, r0, r2
250 bne 2b
251 ldmfd sp!, {r4-r10,pc}
252 .align 5
253 3:
254 ldmia r1, {r4-r5, r10}
255 add r1, r1, r2
256 ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
257 ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
258 pld [r1]
259 RND_AVG32 r4, r5, r6, r7, r8, r9, r12
260 subs r3, r3, #1
261 stmia r0, {r4-r5}
262 add r0, r0, r2
263 bne 3b
264 ldmfd sp!, {r4-r10,pc}
265 .align 5
266 4:
267 ldmia r1, {r4-r5, r10}
268 add r1, r1, r2
269 ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
270 pld [r1]
271 RND_AVG32 r8, r9, r6, r7, r5, r10, r12
272 subs r3, r3, #1
273 stmia r0, {r8-r9}
274 add r0, r0, r2
275 bne 4b
276 ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
277 5:
278 .word 0xFEFEFEFE
279 .word 2b
280 .word 3b
281 .word 4b
282 .endfunc
283
284 .align 5
285 function put_no_rnd_pixels8_x2_arm, export=1
286 @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
287 @ block = word aligned, pixles = unaligned
288 pld [r1]
289 stmfd sp!, {r4-r10,lr} @ R14 is also called LR
290 adr r5, 5f
291 ands r4, r1, #3
292 ldr r12, [r5]
293 add r5, r5, r4, lsl #2
294 bic r1, r1, #3
295 ldrne pc, [r5]
296 1:
297 ldmia r1, {r4-r5, r10}
298 add r1, r1, r2
299 ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
300 pld [r1]
301 NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
302 subs r3, r3, #1
303 stmia r0, {r8-r9}
304 add r0, r0, r2
305 bne 1b
306 ldmfd sp!, {r4-r10,pc}
307 .align 5
308 2:
309 ldmia r1, {r4-r5, r10}
310 add r1, r1, r2
311 ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
312 ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
313 pld [r1]
314 NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
315 subs r3, r3, #1
316 stmia r0, {r4-r5}
317 add r0, r0, r2
318 bne 2b
319 ldmfd sp!, {r4-r10,pc}
320 .align 5
321 3:
322 ldmia r1, {r4-r5, r10}
323 add r1, r1, r2
324 ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
325 ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
326 pld [r1]
327 NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
328 subs r3, r3, #1
329 stmia r0, {r4-r5}
330 add r0, r0, r2
331 bne 3b
332 ldmfd sp!, {r4-r10,pc}
333 .align 5
334 4:
335 ldmia r1, {r4-r5, r10}
336 add r1, r1, r2
337 ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
338 pld [r1]
339 NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
340 subs r3, r3, #1
341 stmia r0, {r8-r9}
342 add r0, r0, r2
343 bne 4b
344 ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
345 5:
346 .word 0xFEFEFEFE
347 .word 2b
348 .word 3b
349 .word 4b
350 .endfunc
351
352
353 @ ----------------------------------------------------------------
354 .align 5
355 function put_pixels8_y2_arm, export=1
356 @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
357 @ block = word aligned, pixles = unaligned
358 pld [r1]
359 stmfd sp!, {r4-r11,lr} @ R14 is also called LR
360 adr r5, 5f
361 ands r4, r1, #3
362 mov r3, r3, lsr #1
363 ldr r12, [r5]
364 add r5, r5, r4, lsl #2
365 bic r1, r1, #3
366 ldrne pc, [r5]
367 1:
368 ldmia r1, {r4-r5}
369 add r1, r1, r2
370 6: ldmia r1, {r6-r7}
371 add r1, r1, r2
372 pld [r1]
373 RND_AVG32 r8, r9, r4, r5, r6, r7, r12
374 ldmia r1, {r4-r5}
375 add r1, r1, r2
376 stmia r0, {r8-r9}
377 add r0, r0, r2
378 pld [r1]
379 RND_AVG32 r8, r9, r6, r7, r4, r5, r12
380 subs r3, r3, #1
381 stmia r0, {r8-r9}
382 add r0, r0, r2
383 bne 6b
384 ldmfd sp!, {r4-r11,pc}
385 .align 5
386 2:
387 ldmia r1, {r4-r6}
388 add r1, r1, r2
389 pld [r1]
390 ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
391 6: ldmia r1, {r7-r9}
392 add r1, r1, r2
393 pld [r1]
394 ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
395 RND_AVG32 r10, r11, r4, r5, r7, r8, r12
396 stmia r0, {r10-r11}
397 add r0, r0, r2
398 ldmia r1, {r4-r6}
399 add r1, r1, r2
400 pld [r1]
401 ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
402 subs r3, r3, #1
403 RND_AVG32 r10, r11, r7, r8, r4, r5, r12
404 stmia r0, {r10-r11}
405 add r0, r0, r2
406 bne 6b
407 ldmfd sp!, {r4-r11,pc}
408 .align 5
409 3:
410 ldmia r1, {r4-r6}
411 add r1, r1, r2
412 pld [r1]
413 ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
414 6: ldmia r1, {r7-r9}
415 add r1, r1, r2
416 pld [r1]
417 ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
418 RND_AVG32 r10, r11, r4, r5, r7, r8, r12
419 stmia r0, {r10-r11}
420 add r0, r0, r2
421 ldmia r1, {r4-r6}
422 add r1, r1, r2
423 pld [r1]
424 ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
425 subs r3, r3, #1
426 RND_AVG32 r10, r11, r7, r8, r4, r5, r12
427 stmia r0, {r10-r11}
428 add r0, r0, r2
429 bne 6b
430 ldmfd sp!, {r4-r11,pc}
431 .align 5
432 4:
433 ldmia r1, {r4-r6}
434 add r1, r1, r2
435 pld [r1]
436 ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
437 6: ldmia r1, {r7-r9}
438 add r1, r1, r2
439 pld [r1]
440 ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
441 RND_AVG32 r10, r11, r4, r5, r7, r8, r12
442 stmia r0, {r10-r11}
443 add r0, r0, r2
444 ldmia r1, {r4-r6}
445 add r1, r1, r2
446 pld [r1]
447 ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
448 subs r3, r3, #1
449 RND_AVG32 r10, r11, r7, r8, r4, r5, r12
450 stmia r0, {r10-r11}
451 add r0, r0, r2
452 bne 6b
453 ldmfd sp!, {r4-r11,pc}
454
455 5:
456 .word 0xFEFEFEFE
457 .word 2b
458 .word 3b
459 .word 4b
460 .endfunc
461
462 .align 5
463 function put_no_rnd_pixels8_y2_arm, export=1
464 @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
465 @ block = word aligned, pixles = unaligned
466 pld [r1]
467 stmfd sp!, {r4-r11,lr} @ R14 is also called LR
468 adr r5, 5f
469 ands r4, r1, #3
470 mov r3, r3, lsr #1
471 ldr r12, [r5]
472 add r5, r5, r4, lsl #2
473 bic r1, r1, #3
474 ldrne pc, [r5]
475 1:
476 ldmia r1, {r4-r5}
477 add r1, r1, r2
478 6: ldmia r1, {r6-r7}
479 add r1, r1, r2
480 pld [r1]
481 NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
482 ldmia r1, {r4-r5}
483 add r1, r1, r2
484 stmia r0, {r8-r9}
485 add r0, r0, r2
486 pld [r1]
487 NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
488 subs r3, r3, #1
489 stmia r0, {r8-r9}
490 add r0, r0, r2
491 bne 6b
492 ldmfd sp!, {r4-r11,pc}
493 .align 5
494 2:
495 ldmia r1, {r4-r6}
496 add r1, r1, r2
497 pld [r1]
498 ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
499 6: ldmia r1, {r7-r9}
500 add r1, r1, r2
501 pld [r1]
502 ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
503 NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
504 stmia r0, {r10-r11}
505 add r0, r0, r2
506 ldmia r1, {r4-r6}
507 add r1, r1, r2
508 pld [r1]
509 ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
510 subs r3, r3, #1
511 NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
512 stmia r0, {r10-r11}
513 add r0, r0, r2
514 bne 6b
515 ldmfd sp!, {r4-r11,pc}
516 .align 5
517 3:
518 ldmia r1, {r4-r6}
519 add r1, r1, r2
520 pld [r1]
521 ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
522 6: ldmia r1, {r7-r9}
523 add r1, r1, r2
524 pld [r1]
525 ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
526 NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
527 stmia r0, {r10-r11}
528 add r0, r0, r2
529 ldmia r1, {r4-r6}
530 add r1, r1, r2
531 pld [r1]
532 ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
533 subs r3, r3, #1
534 NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
535 stmia r0, {r10-r11}
536 add r0, r0, r2
537 bne 6b
538 ldmfd sp!, {r4-r11,pc}
539 .align 5
540 4:
541 ldmia r1, {r4-r6}
542 add r1, r1, r2
543 pld [r1]
544 ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
545 6: ldmia r1, {r7-r9}
546 add r1, r1, r2
547 pld [r1]
548 ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
549 NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
550 stmia r0, {r10-r11}
551 add r0, r0, r2
552 ldmia r1, {r4-r6}
553 add r1, r1, r2
554 pld [r1]
555 ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
556 subs r3, r3, #1
557 NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
558 stmia r0, {r10-r11}
559 add r0, r0, r2
560 bne 6b
561 ldmfd sp!, {r4-r11,pc}
562 5:
563 .word 0xFEFEFEFE
564 .word 2b
565 .word 3b
566 .word 4b
567 .endfunc
568
569 @ ----------------------------------------------------------------
@ RND_XY2_IT align
@ Processes one source row for the xy2 functions: loads three words from r1,
@ advances r1 by r2, and splits the row (a) and the row taken one byte further
@ on (b) into low and high bit sums.
@ Expects r12 to point at the constant table of the calling function
@ (offsets 0, 16 and 20 are read here) and r3 to hold the row counter.
@ Out: r8, r9   = l1 for the two output words
@      r10, r11 = h1 for the two output words
@      flags    = result of the final subs on r3, consumed by RND_XY2_EXPAND
@ Clobbers r4-r7 and r14. \align is the byte offset (0..3) of the source.
570 .macro RND_XY2_IT align
571 @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
572 @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
573 .if \align == 0
574 ldmia r1, {r6-r8}
575 .elseif \align == 3
576 ldmia r1, {r5-r7}
577 .else
578 ldmia r1, {r8-r10}
579 .endif
580 add r1, r1, r2
581 pld [r1]
@ After this group r4,r5 hold one operand and r6,r7 the other. For align 0 and
@ align 3 one operand is already word aligned and stays where it was loaded.
582 .if \align == 0
583 ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8
584 .elseif \align == 1
585 ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r8, r9, r10
586 ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r8, r9, r10
587 .elseif \align == 2
588 ADJ_ALIGN_DOUBLEWORD_D 2, r4, r5, r8, r9, r10
589 ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r8, r9, r10
590 .elseif \align == 3
591 ADJ_ALIGN_DOUBLEWORD_D 3, r4, r5, r5, r6, r7
592 .endif
593 ldr r14, [r12, #0] @ 0x03030303
@ Z is set when the row counter is even; only then is the table constant at
@ offset 16 added to l1 by the conditional instructions below.
594 tst r3, #1
595 and r8, r4, r14
596 and r9, r5, r14
597 and r10, r6, r14
598 and r11, r7, r14
599 ldreq r14, [r12, #16] @ 0x02020202/0x01010101
600 add r8, r8, r10
601 add r9, r9, r11
602 addeq r8, r8, r14
603 addeq r9, r9, r14
604 ldr r14, [r12, #20] @ 0xFCFCFCFC >> 2
605 and r4, r14, r4, lsr #2
606 and r5, r14, r5, lsr #2
607 and r6, r14, r6, lsr #2
608 and r7, r14, r7, lsr #2
609 add r10, r4, r6
610 add r11, r5, r7
@ Counts the row and leaves the flags for the bge in RND_XY2_EXPAND.
611 subs r3, r3, #1
612 .endm
613
@ RND_XY2_EXPAND align
@ Complete row loop of the xy2 functions for one source alignment.
@ The first RND_XY2_IT primes r8-r11 with the sums of the first row. Each pass
@ of the loop at 6: then saves those sums on the stack, runs RND_XY2_IT on the
@ next row, adds the two sets of sums, stores one 8-byte output row at r0 and
@ advances r0 by r2.
@ The bge tests the flags left by the subs inside RND_XY2_IT; none of the
@ instructions in between set flags. The macro ends by restoring r4-r11 and
@ returning, so it must follow a matching stmfd sp!, {r4-r11,lr}.
614 .macro RND_XY2_EXPAND align
615 RND_XY2_IT \align
616 6: stmfd sp!, {r8-r11}
617 RND_XY2_IT \align
@ r4-r7 = sums of the previous row, r8-r11 = sums of the row just processed
618 ldmfd sp!, {r4-r7}
619 add r4, r4, r8
620 add r5, r5, r9
621 add r6, r6, r10
622 add r7, r7, r11
623 ldr r14, [r12, #24] @ 0x0F0F0F0F
@ result = high sum + ((low sum >> 2) & 0x0F0F0F0F)
624 and r4, r14, r4, lsr #2
625 and r5, r14, r5, lsr #2
626 add r4, r4, r6
627 add r5, r5, r7
628 stmia r0, {r4-r5}
629 add r0, r0, r2
630 bge 6b
631 ldmfd sp!, {r4-r11,pc}
632 .endm
633
634 .align 5
635 function put_pixels8_xy2_arm, export=1
636 @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
637 @ block = word aligned, pixles = unaligned
638 pld [r1]
639 stmfd sp!, {r4-r11,lr} @ R14 is also called LR
640 adrl r12, 5f
641 ands r4, r1, #3
642 add r5, r12, r4, lsl #2
643 bic r1, r1, #3
644 ldrne pc, [r5]
645 1:
646 RND_XY2_EXPAND 0
647
648 .align 5
649 2:
650 RND_XY2_EXPAND 1
651
652 .align 5
653 3:
654 RND_XY2_EXPAND 2
655
656 .align 5
657 4:
658 RND_XY2_EXPAND 3
659
660 5:
661 .word 0x03030303
662 .word 2b
663 .word 3b
664 .word 4b
665 .word 0x02020202
666 .word 0xFCFCFCFC >> 2
667 .word 0x0F0F0F0F
668 .endfunc
669
670 .align 5
671 function put_no_rnd_pixels8_xy2_arm, export=1
672 @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
673 @ block = word aligned, pixles = unaligned
674 pld [r1]
675 stmfd sp!, {r4-r11,lr} @ R14 is also called LR
676 adrl r12, 5f
677 ands r4, r1, #3
678 add r5, r12, r4, lsl #2
679 bic r1, r1, #3
680 ldrne pc, [r5]
681 1:
682 RND_XY2_EXPAND 0
683
684 .align 5
685 2:
686 RND_XY2_EXPAND 1
687
688 .align 5
689 3:
690 RND_XY2_EXPAND 2
691
692 .align 5
693 4:
694 RND_XY2_EXPAND 3
695
696 5:
697 .word 0x03030303
698 .word 2b
699 .word 3b
700 .word 4b
701 .word 0x01010101
702 .word 0xFCFCFCFC >> 2
703 .word 0x0F0F0F0F
704 .endfunc
705
@ Adds an 8x8 block of signed 16-bit values to the bytes at dest and writes
@ the clamped sums back to dest.
@ In:  r0 = block, advanced by 16 bytes per row
@      r1 = dest, advanced by stride per row; each row is read and written as
@           two words, so dest is presumably word aligned (confirm with callers)
@      r2 = stride
@ r10 counts the 8 rows; r4-r10 are saved on entry and restored before return.
@ Clamping: a sum is treated as out of range when its bit 8 is set and is then
@ replaced by the top byte of the inverted coefficient, which is 0xFF for a
@ non-negative coefficient and 0x00 for a negative one. Sums outside -256..511
@ leave bit 8 clear and are not caught by this test.
@ NOTE(review): pairing the low byte of each word with the lowest-numbered
@ coefficient assumes little-endian byte order; confirm for big-endian builds.
@ The bracketed tags [A]..[F] mark where a rescheduled instruction came from.
706 .align 5
707 @ void ff_add_pixels_clamped_ARM(int16_t *block, uint8_t *dest, int stride)
708 function ff_add_pixels_clamped_ARM, export=1
709 push {r4-r10}
710 mov r10, #8
711 1:
712 ldr r4, [r1] /* load dest */
713 /* block[0] and block[1]*/
714 ldrsh r5, [r0]
715 ldrsh r7, [r0, #2]
716 and r6, r4, #0xFF
717 and r8, r4, #0xFF00
718 add r6, r5, r6
719 add r8, r7, r8, lsr #8
@ r5 and r7 become the inverted coefficients; their top byte is the clamp value
720 mvn r5, r5
721 mvn r7, r7
722 tst r6, #0x100
723 movne r6, r5, lsr #24
724 tst r8, #0x100
725 movne r8, r7, lsr #24
726 mov r9, r6
727 ldrsh r5, [r0, #4] /* moved from [A] */
728 orr r9, r9, r8, lsl #8
729 /* block[2] and block[3] */
730 /* [A] */
731 ldrsh r7, [r0, #6]
732 and r6, r4, #0xFF0000
733 and r8, r4, #0xFF000000
734 add r6, r5, r6, lsr #16
735 add r8, r7, r8, lsr #24
736 mvn r5, r5
737 mvn r7, r7
738 tst r6, #0x100
739 movne r6, r5, lsr #24
740 tst r8, #0x100
741 movne r8, r7, lsr #24
742 orr r9, r9, r6, lsl #16
743 ldr r4, [r1, #4] /* moved from [B] */
744 orr r9, r9, r8, lsl #24
745 /* store dest */
746 ldrsh r5, [r0, #8] /* moved from [C] */
747 str r9, [r1]
748
749 /* load dest */
750 /* [B] */
751 /* block[4] and block[5] */
752 /* [C] */
753 ldrsh r7, [r0, #10]
754 and r6, r4, #0xFF
755 and r8, r4, #0xFF00
756 add r6, r5, r6
757 add r8, r7, r8, lsr #8
758 mvn r5, r5
759 mvn r7, r7
760 tst r6, #0x100
761 movne r6, r5, lsr #24
762 tst r8, #0x100
763 movne r8, r7, lsr #24
764 mov r9, r6
765 ldrsh r5, [r0, #12] /* moved from [D] */
766 orr r9, r9, r8, lsl #8
767 /* block[6] and block[7] */
768 /* [D] */
769 ldrsh r7, [r0, #14]
770 and r6, r4, #0xFF0000
771 and r8, r4, #0xFF000000
772 add r6, r5, r6, lsr #16
773 add r8, r7, r8, lsr #24
774 mvn r5, r5
775 mvn r7, r7
776 tst r6, #0x100
777 movne r6, r5, lsr #24
778 tst r8, #0x100
779 movne r8, r7, lsr #24
780 orr r9, r9, r6, lsl #16
781 add r0, r0, #16 /* moved from [E] */
782 orr r9, r9, r8, lsl #24
@ The flags from this subs reach the bne below: str and add do not set flags.
783 subs r10, r10, #1 /* moved from [F] */
784 /* store dest */
785 str r9, [r1, #4]
786
787 /* [E] */
788 /* [F] */
789 add r1, r1, r2
790 bne 1b
791
792 pop {r4-r10}
793 bx lr
794 .endfunc