Add missing multiple inclusion guards.
[libav.git] / libavcodec / armv4l / dsputil_iwmmxt_rnd.h
1 /*
2 * iWMMXt optimized DSP utils
3 * copyright (c) 2004 AGAWA Koji
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22 #ifndef FFMPEG_DSPUTIL_IWMMXT_RND_H
23 #define FFMPEG_DSPUTIL_IWMMXT_RND_H
24
25 void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
26 {
27 int stride = line_size;
28 __asm__ __volatile__ (
29 "and r12, %[pixels], #7 \n\t"
30 "bic %[pixels], %[pixels], #7 \n\t"
31 "tmcr wcgr1, r12 \n\t"
32 "add r4, %[pixels], %[line_size] \n\t"
33 "add r5, %[block], %[line_size] \n\t"
34 "mov %[line_size], %[line_size], lsl #1 \n\t"
35 "1: \n\t"
36 "wldrd wr0, [%[pixels]] \n\t"
37 "subs %[h], %[h], #2 \n\t"
38 "wldrd wr1, [%[pixels], #8] \n\t"
39 "add %[pixels], %[pixels], %[line_size] \n\t"
40 "wldrd wr3, [r4] \n\t"
41 "pld [%[pixels]] \n\t"
42 "pld [%[pixels], #32] \n\t"
43 "wldrd wr4, [r4, #8] \n\t"
44 "add r4, r4, %[line_size] \n\t"
45 "walignr1 wr8, wr0, wr1 \n\t"
46 "pld [r4] \n\t"
47 "pld [r4, #32] \n\t"
48 "walignr1 wr10, wr3, wr4 \n\t"
49 "wstrd wr8, [%[block]] \n\t"
50 "add %[block], %[block], %[line_size] \n\t"
51 "wstrd wr10, [r5] \n\t"
52 "add r5, r5, %[line_size] \n\t"
53 "bne 1b \n\t"
54 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
55 :
56 : "memory", "r4", "r5", "r12");
57 }
58
59 void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
60 {
61 int stride = line_size;
62 __asm__ __volatile__ (
63 "and r12, %[pixels], #7 \n\t"
64 "bic %[pixels], %[pixels], #7 \n\t"
65 "tmcr wcgr1, r12 \n\t"
66 "add r4, %[pixels], %[line_size] \n\t"
67 "add r5, %[block], %[line_size] \n\t"
68 "mov %[line_size], %[line_size], lsl #1 \n\t"
69 "1: \n\t"
70 "wldrd wr0, [%[pixels]] \n\t"
71 "subs %[h], %[h], #2 \n\t"
72 "wldrd wr1, [%[pixels], #8] \n\t"
73 "add %[pixels], %[pixels], %[line_size] \n\t"
74 "wldrd wr3, [r4] \n\t"
75 "pld [%[pixels]] \n\t"
76 "pld [%[pixels], #32] \n\t"
77 "wldrd wr4, [r4, #8] \n\t"
78 "add r4, r4, %[line_size] \n\t"
79 "walignr1 wr8, wr0, wr1 \n\t"
80 "wldrd wr0, [%[block]] \n\t"
81 "wldrd wr2, [r5] \n\t"
82 "pld [r4] \n\t"
83 "pld [r4, #32] \n\t"
84 "walignr1 wr10, wr3, wr4 \n\t"
85 WAVG2B" wr8, wr8, wr0 \n\t"
86 WAVG2B" wr10, wr10, wr2 \n\t"
87 "wstrd wr8, [%[block]] \n\t"
88 "add %[block], %[block], %[line_size] \n\t"
89 "wstrd wr10, [r5] \n\t"
90 "pld [%[block]] \n\t"
91 "pld [%[block], #32] \n\t"
92 "add r5, r5, %[line_size] \n\t"
93 "pld [r5] \n\t"
94 "pld [r5, #32] \n\t"
95 "bne 1b \n\t"
96 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
97 :
98 : "memory", "r4", "r5", "r12");
99 }
100
101 void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
102 {
103 int stride = line_size;
104 __asm__ __volatile__ (
105 "and r12, %[pixels], #7 \n\t"
106 "bic %[pixels], %[pixels], #7 \n\t"
107 "tmcr wcgr1, r12 \n\t"
108 "add r4, %[pixels], %[line_size] \n\t"
109 "add r5, %[block], %[line_size] \n\t"
110 "mov %[line_size], %[line_size], lsl #1 \n\t"
111 "1: \n\t"
112 "wldrd wr0, [%[pixels]] \n\t"
113 "wldrd wr1, [%[pixels], #8] \n\t"
114 "subs %[h], %[h], #2 \n\t"
115 "wldrd wr2, [%[pixels], #16] \n\t"
116 "add %[pixels], %[pixels], %[line_size] \n\t"
117 "wldrd wr3, [r4] \n\t"
118 "pld [%[pixels]] \n\t"
119 "pld [%[pixels], #32] \n\t"
120 "walignr1 wr8, wr0, wr1 \n\t"
121 "wldrd wr4, [r4, #8] \n\t"
122 "walignr1 wr9, wr1, wr2 \n\t"
123 "wldrd wr5, [r4, #16] \n\t"
124 "add r4, r4, %[line_size] \n\t"
125 "pld [r4] \n\t"
126 "pld [r4, #32] \n\t"
127 "walignr1 wr10, wr3, wr4 \n\t"
128 "wstrd wr8, [%[block]] \n\t"
129 "walignr1 wr11, wr4, wr5 \n\t"
130 "wstrd wr9, [%[block], #8] \n\t"
131 "add %[block], %[block], %[line_size] \n\t"
132 "wstrd wr10, [r5] \n\t"
133 "wstrd wr11, [r5, #8] \n\t"
134 "add r5, r5, %[line_size] \n\t"
135 "bne 1b \n\t"
136 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
137 :
138 : "memory", "r4", "r5", "r12");
139 }
140
141 void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
142 {
143 int stride = line_size;
144 __asm__ __volatile__ (
145 "pld [%[pixels]] \n\t"
146 "pld [%[pixels], #32] \n\t"
147 "pld [%[block]] \n\t"
148 "pld [%[block], #32] \n\t"
149 "and r12, %[pixels], #7 \n\t"
150 "bic %[pixels], %[pixels], #7 \n\t"
151 "tmcr wcgr1, r12 \n\t"
152 "add r4, %[pixels], %[line_size]\n\t"
153 "add r5, %[block], %[line_size] \n\t"
154 "mov %[line_size], %[line_size], lsl #1 \n\t"
155 "1: \n\t"
156 "wldrd wr0, [%[pixels]] \n\t"
157 "wldrd wr1, [%[pixels], #8] \n\t"
158 "subs %[h], %[h], #2 \n\t"
159 "wldrd wr2, [%[pixels], #16] \n\t"
160 "add %[pixels], %[pixels], %[line_size] \n\t"
161 "wldrd wr3, [r4] \n\t"
162 "pld [%[pixels]] \n\t"
163 "pld [%[pixels], #32] \n\t"
164 "walignr1 wr8, wr0, wr1 \n\t"
165 "wldrd wr4, [r4, #8] \n\t"
166 "walignr1 wr9, wr1, wr2 \n\t"
167 "wldrd wr5, [r4, #16] \n\t"
168 "add r4, r4, %[line_size] \n\t"
169 "wldrd wr0, [%[block]] \n\t"
170 "pld [r4] \n\t"
171 "wldrd wr1, [%[block], #8] \n\t"
172 "pld [r4, #32] \n\t"
173 "wldrd wr2, [r5] \n\t"
174 "walignr1 wr10, wr3, wr4 \n\t"
175 "wldrd wr3, [r5, #8] \n\t"
176 WAVG2B" wr8, wr8, wr0 \n\t"
177 WAVG2B" wr9, wr9, wr1 \n\t"
178 WAVG2B" wr10, wr10, wr2 \n\t"
179 "wstrd wr8, [%[block]] \n\t"
180 "walignr1 wr11, wr4, wr5 \n\t"
181 WAVG2B" wr11, wr11, wr3 \n\t"
182 "wstrd wr9, [%[block], #8] \n\t"
183 "add %[block], %[block], %[line_size] \n\t"
184 "wstrd wr10, [r5] \n\t"
185 "pld [%[block]] \n\t"
186 "pld [%[block], #32] \n\t"
187 "wstrd wr11, [r5, #8] \n\t"
188 "add r5, r5, %[line_size] \n\t"
189 "pld [r5] \n\t"
190 "pld [r5, #32] \n\t"
191 "bne 1b \n\t"
192 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
193 :
194 : "memory", "r4", "r5", "r12");
195 }
196
197 void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
198 {
199 int stride = line_size;
200 // [wr0 wr1 wr2 wr3] for previous line
201 // [wr4 wr5 wr6 wr7] for current line
202 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
203 __asm__ __volatile__(
204 "pld [%[pixels]] \n\t"
205 "pld [%[pixels], #32] \n\t"
206 "and r12, %[pixels], #7 \n\t"
207 "bic %[pixels], %[pixels], #7 \n\t"
208 "tmcr wcgr1, r12 \n\t"
209 "add r12, r12, #1 \n\t"
210 "add r4, %[pixels], %[line_size]\n\t"
211 "tmcr wcgr2, r12 \n\t"
212 "add r5, %[block], %[line_size] \n\t"
213 "mov %[line_size], %[line_size], lsl #1 \n\t"
214
215 "1: \n\t"
216 "wldrd wr10, [%[pixels]] \n\t"
217 "cmp r12, #8 \n\t"
218 "wldrd wr11, [%[pixels], #8] \n\t"
219 "add %[pixels], %[pixels], %[line_size] \n\t"
220 "wldrd wr13, [r4] \n\t"
221 "pld [%[pixels]] \n\t"
222 "wldrd wr14, [r4, #8] \n\t"
223 "pld [%[pixels], #32] \n\t"
224 "add r4, r4, %[line_size] \n\t"
225 "walignr1 wr0, wr10, wr11 \n\t"
226 "pld [r4] \n\t"
227 "pld [r4, #32] \n\t"
228 "walignr1 wr2, wr13, wr14 \n\t"
229 "wmoveq wr4, wr11 \n\t"
230 "wmoveq wr6, wr14 \n\t"
231 "walignr2ne wr4, wr10, wr11 \n\t"
232 "walignr2ne wr6, wr13, wr14 \n\t"
233 WAVG2B" wr0, wr0, wr4 \n\t"
234 WAVG2B" wr2, wr2, wr6 \n\t"
235 "wstrd wr0, [%[block]] \n\t"
236 "subs %[h], %[h], #2 \n\t"
237 "wstrd wr2, [r5] \n\t"
238 "add %[block], %[block], %[line_size] \n\t"
239 "add r5, r5, %[line_size] \n\t"
240 "bne 1b \n\t"
241 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
242 :
243 : "r4", "r5", "r12", "memory");
244 }
245
246 void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
247 {
248 int stride = line_size;
249 // [wr0 wr1 wr2 wr3] for previous line
250 // [wr4 wr5 wr6 wr7] for current line
251 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
252 __asm__ __volatile__(
253 "pld [%[pixels]] \n\t"
254 "pld [%[pixels], #32] \n\t"
255 "and r12, %[pixels], #7 \n\t"
256 "bic %[pixels], %[pixels], #7 \n\t"
257 "tmcr wcgr1, r12 \n\t"
258 "add r12, r12, #1 \n\t"
259 "add r4, %[pixels], %[line_size]\n\t"
260 "tmcr wcgr2, r12 \n\t"
261 "add r5, %[block], %[line_size] \n\t"
262 "mov %[line_size], %[line_size], lsl #1 \n\t"
263
264 "1: \n\t"
265 "wldrd wr10, [%[pixels]] \n\t"
266 "cmp r12, #8 \n\t"
267 "wldrd wr11, [%[pixels], #8] \n\t"
268 "wldrd wr12, [%[pixels], #16] \n\t"
269 "add %[pixels], %[pixels], %[line_size] \n\t"
270 "wldrd wr13, [r4] \n\t"
271 "pld [%[pixels]] \n\t"
272 "wldrd wr14, [r4, #8] \n\t"
273 "pld [%[pixels], #32] \n\t"
274 "wldrd wr15, [r4, #16] \n\t"
275 "add r4, r4, %[line_size] \n\t"
276 "walignr1 wr0, wr10, wr11 \n\t"
277 "pld [r4] \n\t"
278 "pld [r4, #32] \n\t"
279 "walignr1 wr1, wr11, wr12 \n\t"
280 "walignr1 wr2, wr13, wr14 \n\t"
281 "walignr1 wr3, wr14, wr15 \n\t"
282 "wmoveq wr4, wr11 \n\t"
283 "wmoveq wr5, wr12 \n\t"
284 "wmoveq wr6, wr14 \n\t"
285 "wmoveq wr7, wr15 \n\t"
286 "walignr2ne wr4, wr10, wr11 \n\t"
287 "walignr2ne wr5, wr11, wr12 \n\t"
288 "walignr2ne wr6, wr13, wr14 \n\t"
289 "walignr2ne wr7, wr14, wr15 \n\t"
290 WAVG2B" wr0, wr0, wr4 \n\t"
291 WAVG2B" wr1, wr1, wr5 \n\t"
292 "wstrd wr0, [%[block]] \n\t"
293 WAVG2B" wr2, wr2, wr6 \n\t"
294 "wstrd wr1, [%[block], #8] \n\t"
295 WAVG2B" wr3, wr3, wr7 \n\t"
296 "add %[block], %[block], %[line_size] \n\t"
297 "wstrd wr2, [r5] \n\t"
298 "subs %[h], %[h], #2 \n\t"
299 "wstrd wr3, [r5, #8] \n\t"
300 "add r5, r5, %[line_size] \n\t"
301 "bne 1b \n\t"
302 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
303 :
304 : "r4", "r5", "r12", "memory");
305 }
306
307 void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
308 {
309 int stride = line_size;
310 // [wr0 wr1 wr2 wr3] for previous line
311 // [wr4 wr5 wr6 wr7] for current line
312 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
313 __asm__ __volatile__(
314 "pld [%[pixels]] \n\t"
315 "pld [%[pixels], #32] \n\t"
316 "pld [%[block]] \n\t"
317 "pld [%[block], #32] \n\t"
318 "and r12, %[pixels], #7 \n\t"
319 "bic %[pixels], %[pixels], #7 \n\t"
320 "tmcr wcgr1, r12 \n\t"
321 "add r12, r12, #1 \n\t"
322 "add r4, %[pixels], %[line_size]\n\t"
323 "tmcr wcgr2, r12 \n\t"
324 "add r5, %[block], %[line_size] \n\t"
325 "mov %[line_size], %[line_size], lsl #1 \n\t"
326 "pld [r5] \n\t"
327 "pld [r5, #32] \n\t"
328
329 "1: \n\t"
330 "wldrd wr10, [%[pixels]] \n\t"
331 "cmp r12, #8 \n\t"
332 "wldrd wr11, [%[pixels], #8] \n\t"
333 "add %[pixels], %[pixels], %[line_size] \n\t"
334 "wldrd wr13, [r4] \n\t"
335 "pld [%[pixels]] \n\t"
336 "wldrd wr14, [r4, #8] \n\t"
337 "pld [%[pixels], #32] \n\t"
338 "add r4, r4, %[line_size] \n\t"
339 "walignr1 wr0, wr10, wr11 \n\t"
340 "pld [r4] \n\t"
341 "pld [r4, #32] \n\t"
342 "walignr1 wr2, wr13, wr14 \n\t"
343 "wmoveq wr4, wr11 \n\t"
344 "wmoveq wr6, wr14 \n\t"
345 "walignr2ne wr4, wr10, wr11 \n\t"
346 "wldrd wr10, [%[block]] \n\t"
347 "walignr2ne wr6, wr13, wr14 \n\t"
348 "wldrd wr12, [r5] \n\t"
349 WAVG2B" wr0, wr0, wr4 \n\t"
350 WAVG2B" wr2, wr2, wr6 \n\t"
351 WAVG2B" wr0, wr0, wr10 \n\t"
352 WAVG2B" wr2, wr2, wr12 \n\t"
353 "wstrd wr0, [%[block]] \n\t"
354 "subs %[h], %[h], #2 \n\t"
355 "wstrd wr2, [r5] \n\t"
356 "add %[block], %[block], %[line_size] \n\t"
357 "add r5, r5, %[line_size] \n\t"
358 "pld [%[block]] \n\t"
359 "pld [%[block], #32] \n\t"
360 "pld [r5] \n\t"
361 "pld [r5, #32] \n\t"
362 "bne 1b \n\t"
363 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
364 :
365 : "r4", "r5", "r12", "memory");
366 }
367
368 void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
369 {
370 int stride = line_size;
371 // [wr0 wr1 wr2 wr3] for previous line
372 // [wr4 wr5 wr6 wr7] for current line
373 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
374 __asm__ __volatile__(
375 "pld [%[pixels]] \n\t"
376 "pld [%[pixels], #32] \n\t"
377 "pld [%[block]] \n\t"
378 "pld [%[block], #32] \n\t"
379 "and r12, %[pixels], #7 \n\t"
380 "bic %[pixels], %[pixels], #7 \n\t"
381 "tmcr wcgr1, r12 \n\t"
382 "add r12, r12, #1 \n\t"
383 "add r4, %[pixels], %[line_size]\n\t"
384 "tmcr wcgr2, r12 \n\t"
385 "add r5, %[block], %[line_size] \n\t"
386 "mov %[line_size], %[line_size], lsl #1 \n\t"
387 "pld [r5] \n\t"
388 "pld [r5, #32] \n\t"
389
390 "1: \n\t"
391 "wldrd wr10, [%[pixels]] \n\t"
392 "cmp r12, #8 \n\t"
393 "wldrd wr11, [%[pixels], #8] \n\t"
394 "wldrd wr12, [%[pixels], #16] \n\t"
395 "add %[pixels], %[pixels], %[line_size] \n\t"
396 "wldrd wr13, [r4] \n\t"
397 "pld [%[pixels]] \n\t"
398 "wldrd wr14, [r4, #8] \n\t"
399 "pld [%[pixels], #32] \n\t"
400 "wldrd wr15, [r4, #16] \n\t"
401 "add r4, r4, %[line_size] \n\t"
402 "walignr1 wr0, wr10, wr11 \n\t"
403 "pld [r4] \n\t"
404 "pld [r4, #32] \n\t"
405 "walignr1 wr1, wr11, wr12 \n\t"
406 "walignr1 wr2, wr13, wr14 \n\t"
407 "walignr1 wr3, wr14, wr15 \n\t"
408 "wmoveq wr4, wr11 \n\t"
409 "wmoveq wr5, wr12 \n\t"
410 "wmoveq wr6, wr14 \n\t"
411 "wmoveq wr7, wr15 \n\t"
412 "walignr2ne wr4, wr10, wr11 \n\t"
413 "walignr2ne wr5, wr11, wr12 \n\t"
414 "walignr2ne wr6, wr13, wr14 \n\t"
415 "walignr2ne wr7, wr14, wr15 \n\t"
416 "wldrd wr10, [%[block]] \n\t"
417 WAVG2B" wr0, wr0, wr4 \n\t"
418 "wldrd wr11, [%[block], #8] \n\t"
419 WAVG2B" wr1, wr1, wr5 \n\t"
420 "wldrd wr12, [r5] \n\t"
421 WAVG2B" wr2, wr2, wr6 \n\t"
422 "wldrd wr13, [r5, #8] \n\t"
423 WAVG2B" wr3, wr3, wr7 \n\t"
424 WAVG2B" wr0, wr0, wr10 \n\t"
425 WAVG2B" wr1, wr1, wr11 \n\t"
426 WAVG2B" wr2, wr2, wr12 \n\t"
427 WAVG2B" wr3, wr3, wr13 \n\t"
428 "wstrd wr0, [%[block]] \n\t"
429 "subs %[h], %[h], #2 \n\t"
430 "wstrd wr1, [%[block], #8] \n\t"
431 "add %[block], %[block], %[line_size] \n\t"
432 "wstrd wr2, [r5] \n\t"
433 "pld [%[block]] \n\t"
434 "wstrd wr3, [r5, #8] \n\t"
435 "add r5, r5, %[line_size] \n\t"
436 "pld [%[block], #32] \n\t"
437 "pld [r5] \n\t"
438 "pld [r5, #32] \n\t"
439 "bne 1b \n\t"
440 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
441 :
442 :"r4", "r5", "r12", "memory");
443 }
444
445 void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
446 {
447 int stride = line_size;
448 // [wr0 wr1 wr2 wr3] for previous line
449 // [wr4 wr5 wr6 wr7] for current line
450 __asm__ __volatile__(
451 "pld [%[pixels]] \n\t"
452 "pld [%[pixels], #32] \n\t"
453 "and r12, %[pixels], #7 \n\t"
454 "tmcr wcgr1, r12 \n\t"
455 "bic %[pixels], %[pixels], #7 \n\t"
456
457 "wldrd wr10, [%[pixels]] \n\t"
458 "wldrd wr11, [%[pixels], #8] \n\t"
459 "pld [%[block]] \n\t"
460 "add %[pixels], %[pixels], %[line_size] \n\t"
461 "walignr1 wr0, wr10, wr11 \n\t"
462 "pld [%[pixels]] \n\t"
463 "pld [%[pixels], #32] \n\t"
464
465 "1: \n\t"
466 "wldrd wr10, [%[pixels]] \n\t"
467 "wldrd wr11, [%[pixels], #8] \n\t"
468 "add %[pixels], %[pixels], %[line_size] \n\t"
469 "pld [%[pixels]] \n\t"
470 "pld [%[pixels], #32] \n\t"
471 "walignr1 wr4, wr10, wr11 \n\t"
472 "wldrd wr10, [%[block]] \n\t"
473 WAVG2B" wr8, wr0, wr4 \n\t"
474 WAVG2B" wr8, wr8, wr10 \n\t"
475 "wstrd wr8, [%[block]] \n\t"
476 "add %[block], %[block], %[line_size] \n\t"
477
478 "wldrd wr10, [%[pixels]] \n\t"
479 "wldrd wr11, [%[pixels], #8] \n\t"
480 "pld [%[block]] \n\t"
481 "add %[pixels], %[pixels], %[line_size] \n\t"
482 "pld [%[pixels]] \n\t"
483 "pld [%[pixels], #32] \n\t"
484 "walignr1 wr0, wr10, wr11 \n\t"
485 "wldrd wr10, [%[block]] \n\t"
486 WAVG2B" wr8, wr0, wr4 \n\t"
487 WAVG2B" wr8, wr8, wr10 \n\t"
488 "wstrd wr8, [%[block]] \n\t"
489 "add %[block], %[block], %[line_size] \n\t"
490
491 "subs %[h], %[h], #2 \n\t"
492 "pld [%[block]] \n\t"
493 "bne 1b \n\t"
494 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
495 :
496 : "cc", "memory", "r12");
497 }
498
499 void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
500 {
501 int stride = line_size;
502 // [wr0 wr1 wr2 wr3] for previous line
503 // [wr4 wr5 wr6 wr7] for current line
504 __asm__ __volatile__(
505 "pld [%[pixels]] \n\t"
506 "pld [%[pixels], #32] \n\t"
507 "and r12, %[pixels], #7 \n\t"
508 "tmcr wcgr1, r12 \n\t"
509 "bic %[pixels], %[pixels], #7 \n\t"
510
511 "wldrd wr10, [%[pixels]] \n\t"
512 "wldrd wr11, [%[pixels], #8] \n\t"
513 "wldrd wr12, [%[pixels], #16] \n\t"
514 "add %[pixels], %[pixels], %[line_size] \n\t"
515 "pld [%[pixels]] \n\t"
516 "pld [%[pixels], #32] \n\t"
517 "walignr1 wr0, wr10, wr11 \n\t"
518 "walignr1 wr1, wr11, wr12 \n\t"
519
520 "1: \n\t"
521 "wldrd wr10, [%[pixels]] \n\t"
522 "wldrd wr11, [%[pixels], #8] \n\t"
523 "wldrd wr12, [%[pixels], #16] \n\t"
524 "add %[pixels], %[pixels], %[line_size] \n\t"
525 "pld [%[pixels]] \n\t"
526 "pld [%[pixels], #32] \n\t"
527 "walignr1 wr4, wr10, wr11 \n\t"
528 "walignr1 wr5, wr11, wr12 \n\t"
529 WAVG2B" wr8, wr0, wr4 \n\t"
530 WAVG2B" wr9, wr1, wr5 \n\t"
531 "wstrd wr8, [%[block]] \n\t"
532 "wstrd wr9, [%[block], #8] \n\t"
533 "add %[block], %[block], %[line_size] \n\t"
534
535 "wldrd wr10, [%[pixels]] \n\t"
536 "wldrd wr11, [%[pixels], #8] \n\t"
537 "wldrd wr12, [%[pixels], #16] \n\t"
538 "add %[pixels], %[pixels], %[line_size] \n\t"
539 "pld [%[pixels]] \n\t"
540 "pld [%[pixels], #32] \n\t"
541 "walignr1 wr0, wr10, wr11 \n\t"
542 "walignr1 wr1, wr11, wr12 \n\t"
543 WAVG2B" wr8, wr0, wr4 \n\t"
544 WAVG2B" wr9, wr1, wr5 \n\t"
545 "wstrd wr8, [%[block]] \n\t"
546 "wstrd wr9, [%[block], #8] \n\t"
547 "add %[block], %[block], %[line_size] \n\t"
548
549 "subs %[h], %[h], #2 \n\t"
550 "bne 1b \n\t"
551 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
552 :
553 : "r4", "r5", "r12", "memory");
554 }
555
556 void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
557 {
558 int stride = line_size;
559 // [wr0 wr1 wr2 wr3] for previous line
560 // [wr4 wr5 wr6 wr7] for current line
561 __asm__ __volatile__(
562 "pld [%[pixels]] \n\t"
563 "pld [%[pixels], #32] \n\t"
564 "and r12, %[pixels], #7 \n\t"
565 "tmcr wcgr1, r12 \n\t"
566 "bic %[pixels], %[pixels], #7 \n\t"
567
568 "wldrd wr10, [%[pixels]] \n\t"
569 "wldrd wr11, [%[pixels], #8] \n\t"
570 "pld [%[block]] \n\t"
571 "wldrd wr12, [%[pixels], #16] \n\t"
572 "add %[pixels], %[pixels], %[line_size] \n\t"
573 "pld [%[pixels]] \n\t"
574 "pld [%[pixels], #32] \n\t"
575 "walignr1 wr0, wr10, wr11 \n\t"
576 "walignr1 wr1, wr11, wr12 \n\t"
577
578 "1: \n\t"
579 "wldrd wr10, [%[pixels]] \n\t"
580 "wldrd wr11, [%[pixels], #8] \n\t"
581 "wldrd wr12, [%[pixels], #16] \n\t"
582 "add %[pixels], %[pixels], %[line_size] \n\t"
583 "pld [%[pixels]] \n\t"
584 "pld [%[pixels], #32] \n\t"
585 "walignr1 wr4, wr10, wr11 \n\t"
586 "walignr1 wr5, wr11, wr12 \n\t"
587 "wldrd wr10, [%[block]] \n\t"
588 "wldrd wr11, [%[block], #8] \n\t"
589 WAVG2B" wr8, wr0, wr4 \n\t"
590 WAVG2B" wr9, wr1, wr5 \n\t"
591 WAVG2B" wr8, wr8, wr10 \n\t"
592 WAVG2B" wr9, wr9, wr11 \n\t"
593 "wstrd wr8, [%[block]] \n\t"
594 "wstrd wr9, [%[block], #8] \n\t"
595 "add %[block], %[block], %[line_size] \n\t"
596
597 "wldrd wr10, [%[pixels]] \n\t"
598 "wldrd wr11, [%[pixels], #8] \n\t"
599 "pld [%[block]] \n\t"
600 "wldrd wr12, [%[pixels], #16] \n\t"
601 "add %[pixels], %[pixels], %[line_size] \n\t"
602 "pld [%[pixels]] \n\t"
603 "pld [%[pixels], #32] \n\t"
604 "walignr1 wr0, wr10, wr11 \n\t"
605 "walignr1 wr1, wr11, wr12 \n\t"
606 "wldrd wr10, [%[block]] \n\t"
607 "wldrd wr11, [%[block], #8] \n\t"
608 WAVG2B" wr8, wr0, wr4 \n\t"
609 WAVG2B" wr9, wr1, wr5 \n\t"
610 WAVG2B" wr8, wr8, wr10 \n\t"
611 WAVG2B" wr9, wr9, wr11 \n\t"
612 "wstrd wr8, [%[block]] \n\t"
613 "wstrd wr9, [%[block], #8] \n\t"
614 "add %[block], %[block], %[line_size] \n\t"
615
616 "subs %[h], %[h], #2 \n\t"
617 "pld [%[block]] \n\t"
618 "bne 1b \n\t"
619 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
620 :
621 : "r4", "r5", "r12", "memory");
622 }
623
624 void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
625 {
626 // [wr0 wr1 wr2 wr3] for previous line
627 // [wr4 wr5 wr6 wr7] for current line
628 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
629 __asm__ __volatile__(
630 "pld [%[pixels]] \n\t"
631 "mov r12, #2 \n\t"
632 "pld [%[pixels], #32] \n\t"
633 "tmcr wcgr0, r12 \n\t" /* for shift value */
634 "and r12, %[pixels], #7 \n\t"
635 "bic %[pixels], %[pixels], #7 \n\t"
636 "tmcr wcgr1, r12 \n\t"
637
638 // [wr0 wr1 wr2 wr3] <= *
639 // [wr4 wr5 wr6 wr7]
640 "wldrd wr12, [%[pixels]] \n\t"
641 "add r12, r12, #1 \n\t"
642 "wldrd wr13, [%[pixels], #8] \n\t"
643 "tmcr wcgr2, r12 \n\t"
644 "add %[pixels], %[pixels], %[line_size] \n\t"
645 "cmp r12, #8 \n\t"
646 "pld [%[pixels]] \n\t"
647 "pld [%[pixels], #32] \n\t"
648 "walignr1 wr2, wr12, wr13 \n\t"
649 "wmoveq wr10, wr13 \n\t"
650 "walignr2ne wr10, wr12, wr13 \n\t"
651 "wunpckelub wr0, wr2 \n\t"
652 "wunpckehub wr1, wr2 \n\t"
653 "wunpckelub wr8, wr10 \n\t"
654 "wunpckehub wr9, wr10 \n\t"
655 "waddhus wr0, wr0, wr8 \n\t"
656 "waddhus wr1, wr1, wr9 \n\t"
657
658 "1: \n\t"
659 // [wr0 wr1 wr2 wr3]
660 // [wr4 wr5 wr6 wr7] <= *
661 "wldrd wr12, [%[pixels]] \n\t"
662 "cmp r12, #8 \n\t"
663 "wldrd wr13, [%[pixels], #8] \n\t"
664 "add %[pixels], %[pixels], %[line_size] \n\t"
665 "walignr1 wr6, wr12, wr13 \n\t"
666 "pld [%[pixels]] \n\t"
667 "pld [%[pixels], #32] \n\t"
668 "wmoveq wr10, wr13 \n\t"
669 "walignr2ne wr10, wr12, wr13 \n\t"
670 "wunpckelub wr4, wr6 \n\t"
671 "wunpckehub wr5, wr6 \n\t"
672 "wunpckelub wr8, wr10 \n\t"
673 "wunpckehub wr9, wr10 \n\t"
674 "waddhus wr4, wr4, wr8 \n\t"
675 "waddhus wr5, wr5, wr9 \n\t"
676 "waddhus wr8, wr0, wr4 \n\t"
677 "waddhus wr9, wr1, wr5 \n\t"
678 "waddhus wr8, wr8, wr15 \n\t"
679 "waddhus wr9, wr9, wr15 \n\t"
680 "wsrlhg wr8, wr8, wcgr0 \n\t"
681 "wsrlhg wr9, wr9, wcgr0 \n\t"
682 "wpackhus wr8, wr8, wr9 \n\t"
683 "wstrd wr8, [%[block]] \n\t"
684 "add %[block], %[block], %[line_size] \n\t"
685
686 // [wr0 wr1 wr2 wr3] <= *
687 // [wr4 wr5 wr6 wr7]
688 "wldrd wr12, [%[pixels]] \n\t"
689 "wldrd wr13, [%[pixels], #8] \n\t"
690 "add %[pixels], %[pixels], %[line_size] \n\t"
691 "walignr1 wr2, wr12, wr13 \n\t"
692 "pld [%[pixels]] \n\t"
693 "pld [%[pixels], #32] \n\t"
694 "wmoveq wr10, wr13 \n\t"
695 "walignr2ne wr10, wr12, wr13 \n\t"
696 "wunpckelub wr0, wr2 \n\t"
697 "wunpckehub wr1, wr2 \n\t"
698 "wunpckelub wr8, wr10 \n\t"
699 "wunpckehub wr9, wr10 \n\t"
700 "waddhus wr0, wr0, wr8 \n\t"
701 "waddhus wr1, wr1, wr9 \n\t"
702 "waddhus wr8, wr0, wr4 \n\t"
703 "waddhus wr9, wr1, wr5 \n\t"
704 "waddhus wr8, wr8, wr15 \n\t"
705 "waddhus wr9, wr9, wr15 \n\t"
706 "wsrlhg wr8, wr8, wcgr0 \n\t"
707 "wsrlhg wr9, wr9, wcgr0 \n\t"
708 "wpackhus wr8, wr8, wr9 \n\t"
709 "subs %[h], %[h], #2 \n\t"
710 "wstrd wr8, [%[block]] \n\t"
711 "add %[block], %[block], %[line_size] \n\t"
712 "bne 1b \n\t"
713 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
714 : [line_size]"r"(line_size)
715 : "r12", "memory");
716 }
717
718 void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
719 {
720 // [wr0 wr1 wr2 wr3] for previous line
721 // [wr4 wr5 wr6 wr7] for current line
722 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
723 __asm__ __volatile__(
724 "pld [%[pixels]] \n\t"
725 "mov r12, #2 \n\t"
726 "pld [%[pixels], #32] \n\t"
727 "tmcr wcgr0, r12 \n\t" /* for shift value */
728 /* alignment */
729 "and r12, %[pixels], #7 \n\t"
730 "bic %[pixels], %[pixels], #7 \n\t"
731 "tmcr wcgr1, r12 \n\t"
732 "add r12, r12, #1 \n\t"
733 "tmcr wcgr2, r12 \n\t"
734
735 // [wr0 wr1 wr2 wr3] <= *
736 // [wr4 wr5 wr6 wr7]
737 "wldrd wr12, [%[pixels]] \n\t"
738 "cmp r12, #8 \n\t"
739 "wldrd wr13, [%[pixels], #8] \n\t"
740 "wldrd wr14, [%[pixels], #16] \n\t"
741 "add %[pixels], %[pixels], %[line_size] \n\t"
742 "pld [%[pixels]] \n\t"
743 "walignr1 wr2, wr12, wr13 \n\t"
744 "pld [%[pixels], #32] \n\t"
745 "walignr1 wr3, wr13, wr14 \n\t"
746 "wmoveq wr10, wr13 \n\t"
747 "wmoveq wr11, wr14 \n\t"
748 "walignr2ne wr10, wr12, wr13 \n\t"
749 "walignr2ne wr11, wr13, wr14 \n\t"
750 "wunpckelub wr0, wr2 \n\t"
751 "wunpckehub wr1, wr2 \n\t"
752 "wunpckelub wr2, wr3 \n\t"
753 "wunpckehub wr3, wr3 \n\t"
754 "wunpckelub wr8, wr10 \n\t"
755 "wunpckehub wr9, wr10 \n\t"
756 "wunpckelub wr10, wr11 \n\t"
757 "wunpckehub wr11, wr11 \n\t"
758 "waddhus wr0, wr0, wr8 \n\t"
759 "waddhus wr1, wr1, wr9 \n\t"
760 "waddhus wr2, wr2, wr10 \n\t"
761 "waddhus wr3, wr3, wr11 \n\t"
762
763 "1: \n\t"
764 // [wr0 wr1 wr2 wr3]
765 // [wr4 wr5 wr6 wr7] <= *
766 "wldrd wr12, [%[pixels]] \n\t"
767 "cmp r12, #8 \n\t"
768 "wldrd wr13, [%[pixels], #8] \n\t"
769 "wldrd wr14, [%[pixels], #16] \n\t"
770 "add %[pixels], %[pixels], %[line_size] \n\t"
771 "walignr1 wr6, wr12, wr13 \n\t"
772 "pld [%[pixels]] \n\t"
773 "pld [%[pixels], #32] \n\t"
774 "walignr1 wr7, wr13, wr14 \n\t"
775 "wmoveq wr10, wr13 \n\t"
776 "wmoveq wr11, wr14 \n\t"
777 "walignr2ne wr10, wr12, wr13 \n\t"
778 "walignr2ne wr11, wr13, wr14 \n\t"
779 "wunpckelub wr4, wr6 \n\t"
780 "wunpckehub wr5, wr6 \n\t"
781 "wunpckelub wr6, wr7 \n\t"
782 "wunpckehub wr7, wr7 \n\t"
783 "wunpckelub wr8, wr10 \n\t"
784 "wunpckehub wr9, wr10 \n\t"
785 "wunpckelub wr10, wr11 \n\t"
786 "wunpckehub wr11, wr11 \n\t"
787 "waddhus wr4, wr4, wr8 \n\t"
788 "waddhus wr5, wr5, wr9 \n\t"
789 "waddhus wr6, wr6, wr10 \n\t"
790 "waddhus wr7, wr7, wr11 \n\t"
791 "waddhus wr8, wr0, wr4 \n\t"
792 "waddhus wr9, wr1, wr5 \n\t"
793 "waddhus wr10, wr2, wr6 \n\t"
794 "waddhus wr11, wr3, wr7 \n\t"
795 "waddhus wr8, wr8, wr15 \n\t"
796 "waddhus wr9, wr9, wr15 \n\t"
797 "waddhus wr10, wr10, wr15 \n\t"
798 "waddhus wr11, wr11, wr15 \n\t"
799 "wsrlhg wr8, wr8, wcgr0 \n\t"
800 "wsrlhg wr9, wr9, wcgr0 \n\t"
801 "wsrlhg wr10, wr10, wcgr0 \n\t"
802 "wsrlhg wr11, wr11, wcgr0 \n\t"
803 "wpackhus wr8, wr8, wr9 \n\t"
804 "wpackhus wr9, wr10, wr11 \n\t"
805 "wstrd wr8, [%[block]] \n\t"
806 "wstrd wr9, [%[block], #8] \n\t"
807 "add %[block], %[block], %[line_size] \n\t"
808
809 // [wr0 wr1 wr2 wr3] <= *
810 // [wr4 wr5 wr6 wr7]
811 "wldrd wr12, [%[pixels]] \n\t"
812 "wldrd wr13, [%[pixels], #8] \n\t"
813 "wldrd wr14, [%[pixels], #16] \n\t"
814 "add %[pixels], %[pixels], %[line_size] \n\t"
815 "walignr1 wr2, wr12, wr13 \n\t"
816 "pld [%[pixels]] \n\t"
817 "pld [%[pixels], #32] \n\t"
818 "walignr1 wr3, wr13, wr14 \n\t"
819 "wmoveq wr10, wr13 \n\t"
820 "wmoveq wr11, wr14 \n\t"
821 "walignr2ne wr10, wr12, wr13 \n\t"
822 "walignr2ne wr11, wr13, wr14 \n\t"
823 "wunpckelub wr0, wr2 \n\t"
824 "wunpckehub wr1, wr2 \n\t"
825 "wunpckelub wr2, wr3 \n\t"
826 "wunpckehub wr3, wr3 \n\t"
827 "wunpckelub wr8, wr10 \n\t"
828 "wunpckehub wr9, wr10 \n\t"
829 "wunpckelub wr10, wr11 \n\t"
830 "wunpckehub wr11, wr11 \n\t"
831 "waddhus wr0, wr0, wr8 \n\t"
832 "waddhus wr1, wr1, wr9 \n\t"
833 "waddhus wr2, wr2, wr10 \n\t"
834 "waddhus wr3, wr3, wr11 \n\t"
835 "waddhus wr8, wr0, wr4 \n\t"
836 "waddhus wr9, wr1, wr5 \n\t"
837 "waddhus wr10, wr2, wr6 \n\t"
838 "waddhus wr11, wr3, wr7 \n\t"
839 "waddhus wr8, wr8, wr15 \n\t"
840 "waddhus wr9, wr9, wr15 \n\t"
841 "waddhus wr10, wr10, wr15 \n\t"
842 "waddhus wr11, wr11, wr15 \n\t"
843 "wsrlhg wr8, wr8, wcgr0 \n\t"
844 "wsrlhg wr9, wr9, wcgr0 \n\t"
845 "wsrlhg wr10, wr10, wcgr0 \n\t"
846 "wsrlhg wr11, wr11, wcgr0 \n\t"
847 "wpackhus wr8, wr8, wr9 \n\t"
848 "wpackhus wr9, wr10, wr11 \n\t"
849 "wstrd wr8, [%[block]] \n\t"
850 "wstrd wr9, [%[block], #8] \n\t"
851 "add %[block], %[block], %[line_size] \n\t"
852
853 "subs %[h], %[h], #2 \n\t"
854 "bne 1b \n\t"
855 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
856 : [line_size]"r"(line_size)
857 : "r12", "memory");
858 }
859
860 void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
861 {
862 // [wr0 wr1 wr2 wr3] for previous line
863 // [wr4 wr5 wr6 wr7] for current line
864 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
865 __asm__ __volatile__(
866 "pld [%[block]] \n\t"
867 "pld [%[block], #32] \n\t"
868 "pld [%[pixels]] \n\t"
869 "mov r12, #2 \n\t"
870 "pld [%[pixels], #32] \n\t"
871 "tmcr wcgr0, r12 \n\t" /* for shift value */
872 "and r12, %[pixels], #7 \n\t"
873 "bic %[pixels], %[pixels], #7 \n\t"
874 "tmcr wcgr1, r12 \n\t"
875
876 // [wr0 wr1 wr2 wr3] <= *
877 // [wr4 wr5 wr6 wr7]
878 "wldrd wr12, [%[pixels]] \n\t"
879 "add r12, r12, #1 \n\t"
880 "wldrd wr13, [%[pixels], #8] \n\t"
881 "tmcr wcgr2, r12 \n\t"
882 "add %[pixels], %[pixels], %[line_size] \n\t"
883 "cmp r12, #8 \n\t"
884 "pld [%[pixels]] \n\t"
885 "pld [%[pixels], #32] \n\t"
886 "walignr1 wr2, wr12, wr13 \n\t"
887 "wmoveq wr10, wr13 \n\t"
888 "walignr2ne wr10, wr12, wr13 \n\t"
889 "wunpckelub wr0, wr2 \n\t"
890 "wunpckehub wr1, wr2 \n\t"
891 "wunpckelub wr8, wr10 \n\t"
892 "wunpckehub wr9, wr10 \n\t"
893 "waddhus wr0, wr0, wr8 \n\t"
894 "waddhus wr1, wr1, wr9 \n\t"
895
896 "1: \n\t"
897 // [wr0 wr1 wr2 wr3]
898 // [wr4 wr5 wr6 wr7] <= *
899 "wldrd wr12, [%[pixels]] \n\t"
900 "cmp r12, #8 \n\t"
901 "wldrd wr13, [%[pixels], #8] \n\t"
902 "add %[pixels], %[pixels], %[line_size] \n\t"
903 "walignr1 wr6, wr12, wr13 \n\t"
904 "pld [%[pixels]] \n\t"
905 "pld [%[pixels], #32] \n\t"
906 "wmoveq wr10, wr13 \n\t"
907 "walignr2ne wr10, wr12, wr13 \n\t"
908 "wunpckelub wr4, wr6 \n\t"
909 "wunpckehub wr5, wr6 \n\t"
910 "wunpckelub wr8, wr10 \n\t"
911 "wunpckehub wr9, wr10 \n\t"
912 "waddhus wr4, wr4, wr8 \n\t"
913 "waddhus wr5, wr5, wr9 \n\t"
914 "waddhus wr8, wr0, wr4 \n\t"
915 "waddhus wr9, wr1, wr5 \n\t"
916 "waddhus wr8, wr8, wr15 \n\t"
917 "waddhus wr9, wr9, wr15 \n\t"
918 "wldrd wr12, [%[block]] \n\t"
919 "wsrlhg wr8, wr8, wcgr0 \n\t"
920 "wsrlhg wr9, wr9, wcgr0 \n\t"
921 "wpackhus wr8, wr8, wr9 \n\t"
922 WAVG2B" wr8, wr8, wr12 \n\t"
923 "wstrd wr8, [%[block]] \n\t"
924 "add %[block], %[block], %[line_size] \n\t"
925 "wldrd wr12, [%[pixels]] \n\t"
926 "pld [%[block]] \n\t"
927 "pld [%[block], #32] \n\t"
928
929 // [wr0 wr1 wr2 wr3] <= *
930 // [wr4 wr5 wr6 wr7]
931 "wldrd wr13, [%[pixels], #8] \n\t"
932 "add %[pixels], %[pixels], %[line_size] \n\t"
933 "walignr1 wr2, wr12, wr13 \n\t"
934 "pld [%[pixels]] \n\t"
935 "pld [%[pixels], #32] \n\t"
936 "wmoveq wr10, wr13 \n\t"
937 "walignr2ne wr10, wr12, wr13 \n\t"
938 "wunpckelub wr0, wr2 \n\t"
939 "wunpckehub wr1, wr2 \n\t"
940 "wunpckelub wr8, wr10 \n\t"
941 "wunpckehub wr9, wr10 \n\t"
942 "waddhus wr0, wr0, wr8 \n\t"
943 "waddhus wr1, wr1, wr9 \n\t"
944 "waddhus wr8, wr0, wr4 \n\t"
945 "waddhus wr9, wr1, wr5 \n\t"
946 "waddhus wr8, wr8, wr15 \n\t"
947 "waddhus wr9, wr9, wr15 \n\t"
948 "wldrd wr12, [%[block]] \n\t"
949 "wsrlhg wr8, wr8, wcgr0 \n\t"
950 "wsrlhg wr9, wr9, wcgr0 \n\t"
951 "wpackhus wr8, wr8, wr9 \n\t"
952 "subs %[h], %[h], #2 \n\t"
953 WAVG2B" wr8, wr8, wr12 \n\t"
954 "wstrd wr8, [%[block]] \n\t"
955 "add %[block], %[block], %[line_size] \n\t"
956 "pld [%[block]] \n\t"
957 "pld [%[block], #32] \n\t"
958 "bne 1b \n\t"
959 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
960 : [line_size]"r"(line_size)
961 : "r12", "memory");
962 }
963
964 void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
965 {
966 // [wr0 wr1 wr2 wr3] for previous line
967 // [wr4 wr5 wr6 wr7] for current line
968 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
969 __asm__ __volatile__(
970 "pld [%[block]] \n\t"
971 "pld [%[block], #32] \n\t"
972 "pld [%[pixels]] \n\t"
973 "mov r12, #2 \n\t"
974 "pld [%[pixels], #32] \n\t"
975 "tmcr wcgr0, r12 \n\t" /* for shift value */
976 /* alignment */
977 "and r12, %[pixels], #7 \n\t"
978 "bic %[pixels], %[pixels], #7 \n\t"
979 "tmcr wcgr1, r12 \n\t"
980 "add r12, r12, #1 \n\t"
981 "tmcr wcgr2, r12 \n\t"
982
983 // [wr0 wr1 wr2 wr3] <= *
984 // [wr4 wr5 wr6 wr7]
985 "wldrd wr12, [%[pixels]] \n\t"
986 "cmp r12, #8 \n\t"
987 "wldrd wr13, [%[pixels], #8] \n\t"
988 "wldrd wr14, [%[pixels], #16] \n\t"
989 "add %[pixels], %[pixels], %[line_size] \n\t"
990 "pld [%[pixels]] \n\t"
991 "walignr1 wr2, wr12, wr13 \n\t"
992 "pld [%[pixels], #32] \n\t"
993 "walignr1 wr3, wr13, wr14 \n\t"
994 "wmoveq wr10, wr13 \n\t"
995 "wmoveq wr11, wr14 \n\t"
996 "walignr2ne wr10, wr12, wr13 \n\t"
997 "walignr2ne wr11, wr13, wr14 \n\t"
998 "wunpckelub wr0, wr2 \n\t"
999 "wunpckehub wr1, wr2 \n\t"
1000 "wunpckelub wr2, wr3 \n\t"
1001 "wunpckehub wr3, wr3 \n\t"
1002 "wunpckelub wr8, wr10 \n\t"
1003 "wunpckehub wr9, wr10 \n\t"
1004 "wunpckelub wr10, wr11 \n\t"
1005 "wunpckehub wr11, wr11 \n\t"
1006 "waddhus wr0, wr0, wr8 \n\t"
1007 "waddhus wr1, wr1, wr9 \n\t"
1008 "waddhus wr2, wr2, wr10 \n\t"
1009 "waddhus wr3, wr3, wr11 \n\t"
1010
1011 "1: \n\t"
1012 // [wr0 wr1 wr2 wr3]
1013 // [wr4 wr5 wr6 wr7] <= *
1014 "wldrd wr12, [%[pixels]] \n\t"
1015 "cmp r12, #8 \n\t"
1016 "wldrd wr13, [%[pixels], #8] \n\t"
1017 "wldrd wr14, [%[pixels], #16] \n\t"
1018 "add %[pixels], %[pixels], %[line_size] \n\t"
1019 "walignr1 wr6, wr12, wr13 \n\t"
1020 "pld [%[pixels]] \n\t"
1021 "pld [%[pixels], #32] \n\t"
1022 "walignr1 wr7, wr13, wr14 \n\t"
1023 "wmoveq wr10, wr13 \n\t"
1024 "wmoveq wr11, wr14 \n\t"
1025 "walignr2ne wr10, wr12, wr13 \n\t"
1026 "walignr2ne wr11, wr13, wr14 \n\t"
1027 "wunpckelub wr4, wr6 \n\t"
1028 "wunpckehub wr5, wr6 \n\t"
1029 "wunpckelub wr6, wr7 \n\t"
1030 "wunpckehub wr7, wr7 \n\t"
1031 "wunpckelub wr8, wr10 \n\t"
1032 "wunpckehub wr9, wr10 \n\t"
1033 "wunpckelub wr10, wr11 \n\t"
1034 "wunpckehub wr11, wr11 \n\t"
1035 "waddhus wr4, wr4, wr8 \n\t"
1036 "waddhus wr5, wr5, wr9 \n\t"
1037 "waddhus wr6, wr6, wr10 \n\t"
1038 "waddhus wr7, wr7, wr11 \n\t"
1039 "waddhus wr8, wr0, wr4 \n\t"
1040 "waddhus wr9, wr1, wr5 \n\t"
1041 "waddhus wr10, wr2, wr6 \n\t"
1042 "waddhus wr11, wr3, wr7 \n\t"
1043 "waddhus wr8, wr8, wr15 \n\t"
1044 "waddhus wr9, wr9, wr15 \n\t"
1045 "waddhus wr10, wr10, wr15 \n\t"
1046 "waddhus wr11, wr11, wr15 \n\t"
1047 "wsrlhg wr8, wr8, wcgr0 \n\t"
1048 "wsrlhg wr9, wr9, wcgr0 \n\t"
1049 "wldrd wr12, [%[block]] \n\t"
1050 "wldrd wr13, [%[block], #8] \n\t"
1051 "wsrlhg wr10, wr10, wcgr0 \n\t"
1052 "wsrlhg wr11, wr11, wcgr0 \n\t"
1053 "wpackhus wr8, wr8, wr9 \n\t"
1054 "wpackhus wr9, wr10, wr11 \n\t"
1055 WAVG2B" wr8, wr8, wr12 \n\t"
1056 WAVG2B" wr9, wr9, wr13 \n\t"
1057 "wstrd wr8, [%[block]] \n\t"
1058 "wstrd wr9, [%[block], #8] \n\t"
1059 "add %[block], %[block], %[line_size] \n\t"
1060
1061 // [wr0 wr1 wr2 wr3] <= *
1062 // [wr4 wr5 wr6 wr7]
1063 "wldrd wr12, [%[pixels]] \n\t"
1064 "pld [%[block]] \n\t"
1065 "wldrd wr13, [%[pixels], #8] \n\t"
1066 "pld [%[block], #32] \n\t"
1067 "wldrd wr14, [%[pixels], #16] \n\t"
1068 "add %[pixels], %[pixels], %[line_size] \n\t"
1069 "walignr1 wr2, wr12, wr13 \n\t"
1070 "pld [%[pixels]] \n\t"
1071 "pld [%[pixels], #32] \n\t"
1072 "walignr1 wr3, wr13, wr14 \n\t"
1073 "wmoveq wr10, wr13 \n\t"
1074 "wmoveq wr11, wr14 \n\t"
1075 "walignr2ne wr10, wr12, wr13 \n\t"
1076 "walignr2ne wr11, wr13, wr14 \n\t"
1077 "wunpckelub wr0, wr2 \n\t"
1078 "wunpckehub wr1, wr2 \n\t"
1079 "wunpckelub wr2, wr3 \n\t"
1080 "wunpckehub wr3, wr3 \n\t"
1081 "wunpckelub wr8, wr10 \n\t"
1082 "wunpckehub wr9, wr10 \n\t"
1083 "wunpckelub wr10, wr11 \n\t"
1084 "wunpckehub wr11, wr11 \n\t"
1085 "waddhus wr0, wr0, wr8 \n\t"
1086 "waddhus wr1, wr1, wr9 \n\t"
1087 "waddhus wr2, wr2, wr10 \n\t"
1088 "waddhus wr3, wr3, wr11 \n\t"
1089 "waddhus wr8, wr0, wr4 \n\t"
1090 "waddhus wr9, wr1, wr5 \n\t"
1091 "waddhus wr10, wr2, wr6 \n\t"
1092 "waddhus wr11, wr3, wr7 \n\t"
1093 "waddhus wr8, wr8, wr15 \n\t"
1094 "waddhus wr9, wr9, wr15 \n\t"
1095 "waddhus wr10, wr10, wr15 \n\t"
1096 "waddhus wr11, wr11, wr15 \n\t"
1097 "wsrlhg wr8, wr8, wcgr0 \n\t"
1098 "wsrlhg wr9, wr9, wcgr0 \n\t"
1099 "wldrd wr12, [%[block]] \n\t"
1100 "wldrd wr13, [%[block], #8] \n\t"
1101 "wsrlhg wr10, wr10, wcgr0 \n\t"
1102 "wsrlhg wr11, wr11, wcgr0 \n\t"
1103 "wpackhus wr8, wr8, wr9 \n\t"
1104 "wpackhus wr9, wr10, wr11 \n\t"
1105 WAVG2B" wr8, wr8, wr12 \n\t"
1106 WAVG2B" wr9, wr9, wr13 \n\t"
1107 "wstrd wr8, [%[block]] \n\t"
1108 "wstrd wr9, [%[block], #8] \n\t"
1109 "add %[block], %[block], %[line_size] \n\t"
1110 "subs %[h], %[h], #2 \n\t"
1111 "pld [%[block]] \n\t"
1112 "pld [%[block], #32] \n\t"
1113 "bne 1b \n\t"
1114 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
1115 : [line_size]"r"(line_size)
1116 : "r12", "memory");
1117 }
1118
1119 #endif /* FFMPEG_DSPUTIL_IWMMXT_RND_H */