Add official LGPL license headers to the files that were missing them.
[libav.git] / libavcodec / armv4l / dsputil_iwmmxt_rnd.h
/*
 * iWMMXt optimized DSP utils
 * copyright (c) 2004 AGAWA Koji
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
19
20 void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
21 {
22 int stride = line_size;
23 __asm__ __volatile__ (
24 "and r12, %[pixels], #7 \n\t"
25 "bic %[pixels], %[pixels], #7 \n\t"
26 "tmcr wcgr1, r12 \n\t"
27 "add r4, %[pixels], %[line_size] \n\t"
28 "add r5, %[block], %[line_size] \n\t"
29 "mov %[line_size], %[line_size], lsl #1 \n\t"
30 "1: \n\t"
31 "wldrd wr0, [%[pixels]] \n\t"
32 "subs %[h], %[h], #2 \n\t"
33 "wldrd wr1, [%[pixels], #8] \n\t"
34 "add %[pixels], %[pixels], %[line_size] \n\t"
35 "wldrd wr3, [r4] \n\t"
36 "pld [%[pixels]] \n\t"
37 "pld [%[pixels], #32] \n\t"
38 "wldrd wr4, [r4, #8] \n\t"
39 "add r4, r4, %[line_size] \n\t"
40 "walignr1 wr8, wr0, wr1 \n\t"
41 "pld [r4] \n\t"
42 "pld [r4, #32] \n\t"
43 "walignr1 wr10, wr3, wr4 \n\t"
44 "wstrd wr8, [%[block]] \n\t"
45 "add %[block], %[block], %[line_size] \n\t"
46 "wstrd wr10, [r5] \n\t"
47 "add r5, r5, %[line_size] \n\t"
48 "bne 1b \n\t"
49 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
50 :
51 : "memory", "r4", "r5", "r12");
52 }
53
54 void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
55 {
56 int stride = line_size;
57 __asm__ __volatile__ (
58 "and r12, %[pixels], #7 \n\t"
59 "bic %[pixels], %[pixels], #7 \n\t"
60 "tmcr wcgr1, r12 \n\t"
61 "add r4, %[pixels], %[line_size] \n\t"
62 "add r5, %[block], %[line_size] \n\t"
63 "mov %[line_size], %[line_size], lsl #1 \n\t"
64 "1: \n\t"
65 "wldrd wr0, [%[pixels]] \n\t"
66 "subs %[h], %[h], #2 \n\t"
67 "wldrd wr1, [%[pixels], #8] \n\t"
68 "add %[pixels], %[pixels], %[line_size] \n\t"
69 "wldrd wr3, [r4] \n\t"
70 "pld [%[pixels]] \n\t"
71 "pld [%[pixels], #32] \n\t"
72 "wldrd wr4, [r4, #8] \n\t"
73 "add r4, r4, %[line_size] \n\t"
74 "walignr1 wr8, wr0, wr1 \n\t"
75 "wldrd wr0, [%[block]] \n\t"
76 "wldrd wr2, [r5] \n\t"
77 "pld [r4] \n\t"
78 "pld [r4, #32] \n\t"
79 "walignr1 wr10, wr3, wr4 \n\t"
80 WAVG2B" wr8, wr8, wr0 \n\t"
81 WAVG2B" wr10, wr10, wr2 \n\t"
82 "wstrd wr8, [%[block]] \n\t"
83 "add %[block], %[block], %[line_size] \n\t"
84 "wstrd wr10, [r5] \n\t"
85 "pld [%[block]] \n\t"
86 "pld [%[block], #32] \n\t"
87 "add r5, r5, %[line_size] \n\t"
88 "pld [r5] \n\t"
89 "pld [r5, #32] \n\t"
90 "bne 1b \n\t"
91 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
92 :
93 : "memory", "r4", "r5", "r12");
94 }
95
96 void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
97 {
98 int stride = line_size;
99 __asm__ __volatile__ (
100 "and r12, %[pixels], #7 \n\t"
101 "bic %[pixels], %[pixels], #7 \n\t"
102 "tmcr wcgr1, r12 \n\t"
103 "add r4, %[pixels], %[line_size] \n\t"
104 "add r5, %[block], %[line_size] \n\t"
105 "mov %[line_size], %[line_size], lsl #1 \n\t"
106 "1: \n\t"
107 "wldrd wr0, [%[pixels]] \n\t"
108 "wldrd wr1, [%[pixels], #8] \n\t"
109 "subs %[h], %[h], #2 \n\t"
110 "wldrd wr2, [%[pixels], #16] \n\t"
111 "add %[pixels], %[pixels], %[line_size] \n\t"
112 "wldrd wr3, [r4] \n\t"
113 "pld [%[pixels]] \n\t"
114 "pld [%[pixels], #32] \n\t"
115 "walignr1 wr8, wr0, wr1 \n\t"
116 "wldrd wr4, [r4, #8] \n\t"
117 "walignr1 wr9, wr1, wr2 \n\t"
118 "wldrd wr5, [r4, #16] \n\t"
119 "add r4, r4, %[line_size] \n\t"
120 "pld [r4] \n\t"
121 "pld [r4, #32] \n\t"
122 "walignr1 wr10, wr3, wr4 \n\t"
123 "wstrd wr8, [%[block]] \n\t"
124 "walignr1 wr11, wr4, wr5 \n\t"
125 "wstrd wr9, [%[block], #8] \n\t"
126 "add %[block], %[block], %[line_size] \n\t"
127 "wstrd wr10, [r5] \n\t"
128 "wstrd wr11, [r5, #8] \n\t"
129 "add r5, r5, %[line_size] \n\t"
130 "bne 1b \n\t"
131 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
132 :
133 : "memory", "r4", "r5", "r12");
134 }
135
136 void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
137 {
138 int stride = line_size;
139 __asm__ __volatile__ (
140 "pld [%[pixels]] \n\t"
141 "pld [%[pixels], #32] \n\t"
142 "pld [%[block]] \n\t"
143 "pld [%[block], #32] \n\t"
144 "and r12, %[pixels], #7 \n\t"
145 "bic %[pixels], %[pixels], #7 \n\t"
146 "tmcr wcgr1, r12 \n\t"
147 "add r4, %[pixels], %[line_size]\n\t"
148 "add r5, %[block], %[line_size] \n\t"
149 "mov %[line_size], %[line_size], lsl #1 \n\t"
150 "1: \n\t"
151 "wldrd wr0, [%[pixels]] \n\t"
152 "wldrd wr1, [%[pixels], #8] \n\t"
153 "subs %[h], %[h], #2 \n\t"
154 "wldrd wr2, [%[pixels], #16] \n\t"
155 "add %[pixels], %[pixels], %[line_size] \n\t"
156 "wldrd wr3, [r4] \n\t"
157 "pld [%[pixels]] \n\t"
158 "pld [%[pixels], #32] \n\t"
159 "walignr1 wr8, wr0, wr1 \n\t"
160 "wldrd wr4, [r4, #8] \n\t"
161 "walignr1 wr9, wr1, wr2 \n\t"
162 "wldrd wr5, [r4, #16] \n\t"
163 "add r4, r4, %[line_size] \n\t"
164 "wldrd wr0, [%[block]] \n\t"
165 "pld [r4] \n\t"
166 "wldrd wr1, [%[block], #8] \n\t"
167 "pld [r4, #32] \n\t"
168 "wldrd wr2, [r5] \n\t"
169 "walignr1 wr10, wr3, wr4 \n\t"
170 "wldrd wr3, [r5, #8] \n\t"
171 WAVG2B" wr8, wr8, wr0 \n\t"
172 WAVG2B" wr9, wr9, wr1 \n\t"
173 WAVG2B" wr10, wr10, wr2 \n\t"
174 "wstrd wr8, [%[block]] \n\t"
175 "walignr1 wr11, wr4, wr5 \n\t"
176 WAVG2B" wr11, wr11, wr3 \n\t"
177 "wstrd wr9, [%[block], #8] \n\t"
178 "add %[block], %[block], %[line_size] \n\t"
179 "wstrd wr10, [r5] \n\t"
180 "pld [%[block]] \n\t"
181 "pld [%[block], #32] \n\t"
182 "wstrd wr11, [r5, #8] \n\t"
183 "add r5, r5, %[line_size] \n\t"
184 "pld [r5] \n\t"
185 "pld [r5, #32] \n\t"
186 "bne 1b \n\t"
187 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
188 :
189 : "memory", "r4", "r5", "r12");
190 }
191
192 void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
193 {
194 int stride = line_size;
195 // [wr0 wr1 wr2 wr3] for previous line
196 // [wr4 wr5 wr6 wr7] for current line
197 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
198 __asm__ __volatile__(
199 "pld [%[pixels]] \n\t"
200 "pld [%[pixels], #32] \n\t"
201 "and r12, %[pixels], #7 \n\t"
202 "bic %[pixels], %[pixels], #7 \n\t"
203 "tmcr wcgr1, r12 \n\t"
204 "add r12, r12, #1 \n\t"
205 "add r4, %[pixels], %[line_size]\n\t"
206 "tmcr wcgr2, r12 \n\t"
207 "add r5, %[block], %[line_size] \n\t"
208 "mov %[line_size], %[line_size], lsl #1 \n\t"
209
210 "1: \n\t"
211 "wldrd wr10, [%[pixels]] \n\t"
212 "cmp r12, #8 \n\t"
213 "wldrd wr11, [%[pixels], #8] \n\t"
214 "add %[pixels], %[pixels], %[line_size] \n\t"
215 "wldrd wr13, [r4] \n\t"
216 "pld [%[pixels]] \n\t"
217 "wldrd wr14, [r4, #8] \n\t"
218 "pld [%[pixels], #32] \n\t"
219 "add r4, r4, %[line_size] \n\t"
220 "walignr1 wr0, wr10, wr11 \n\t"
221 "pld [r4] \n\t"
222 "pld [r4, #32] \n\t"
223 "walignr1 wr2, wr13, wr14 \n\t"
224 "wmoveq wr4, wr11 \n\t"
225 "wmoveq wr6, wr14 \n\t"
226 "walignr2ne wr4, wr10, wr11 \n\t"
227 "walignr2ne wr6, wr13, wr14 \n\t"
228 WAVG2B" wr0, wr0, wr4 \n\t"
229 WAVG2B" wr2, wr2, wr6 \n\t"
230 "wstrd wr0, [%[block]] \n\t"
231 "subs %[h], %[h], #2 \n\t"
232 "wstrd wr2, [r5] \n\t"
233 "add %[block], %[block], %[line_size] \n\t"
234 "add r5, r5, %[line_size] \n\t"
235 "bne 1b \n\t"
236 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
237 :
238 : "r4", "r5", "r12", "memory");
239 }
240
241 void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
242 {
243 int stride = line_size;
244 // [wr0 wr1 wr2 wr3] for previous line
245 // [wr4 wr5 wr6 wr7] for current line
246 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
247 __asm__ __volatile__(
248 "pld [%[pixels]] \n\t"
249 "pld [%[pixels], #32] \n\t"
250 "and r12, %[pixels], #7 \n\t"
251 "bic %[pixels], %[pixels], #7 \n\t"
252 "tmcr wcgr1, r12 \n\t"
253 "add r12, r12, #1 \n\t"
254 "add r4, %[pixels], %[line_size]\n\t"
255 "tmcr wcgr2, r12 \n\t"
256 "add r5, %[block], %[line_size] \n\t"
257 "mov %[line_size], %[line_size], lsl #1 \n\t"
258
259 "1: \n\t"
260 "wldrd wr10, [%[pixels]] \n\t"
261 "cmp r12, #8 \n\t"
262 "wldrd wr11, [%[pixels], #8] \n\t"
263 "wldrd wr12, [%[pixels], #16] \n\t"
264 "add %[pixels], %[pixels], %[line_size] \n\t"
265 "wldrd wr13, [r4] \n\t"
266 "pld [%[pixels]] \n\t"
267 "wldrd wr14, [r4, #8] \n\t"
268 "pld [%[pixels], #32] \n\t"
269 "wldrd wr15, [r4, #16] \n\t"
270 "add r4, r4, %[line_size] \n\t"
271 "walignr1 wr0, wr10, wr11 \n\t"
272 "pld [r4] \n\t"
273 "pld [r4, #32] \n\t"
274 "walignr1 wr1, wr11, wr12 \n\t"
275 "walignr1 wr2, wr13, wr14 \n\t"
276 "walignr1 wr3, wr14, wr15 \n\t"
277 "wmoveq wr4, wr11 \n\t"
278 "wmoveq wr5, wr12 \n\t"
279 "wmoveq wr6, wr14 \n\t"
280 "wmoveq wr7, wr15 \n\t"
281 "walignr2ne wr4, wr10, wr11 \n\t"
282 "walignr2ne wr5, wr11, wr12 \n\t"
283 "walignr2ne wr6, wr13, wr14 \n\t"
284 "walignr2ne wr7, wr14, wr15 \n\t"
285 WAVG2B" wr0, wr0, wr4 \n\t"
286 WAVG2B" wr1, wr1, wr5 \n\t"
287 "wstrd wr0, [%[block]] \n\t"
288 WAVG2B" wr2, wr2, wr6 \n\t"
289 "wstrd wr1, [%[block], #8] \n\t"
290 WAVG2B" wr3, wr3, wr7 \n\t"
291 "add %[block], %[block], %[line_size] \n\t"
292 "wstrd wr2, [r5] \n\t"
293 "subs %[h], %[h], #2 \n\t"
294 "wstrd wr3, [r5, #8] \n\t"
295 "add r5, r5, %[line_size] \n\t"
296 "bne 1b \n\t"
297 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
298 :
299 : "r4", "r5", "r12", "memory");
300 }
301
302 void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
303 {
304 int stride = line_size;
305 // [wr0 wr1 wr2 wr3] for previous line
306 // [wr4 wr5 wr6 wr7] for current line
307 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
308 __asm__ __volatile__(
309 "pld [%[pixels]] \n\t"
310 "pld [%[pixels], #32] \n\t"
311 "pld [%[block]] \n\t"
312 "pld [%[block], #32] \n\t"
313 "and r12, %[pixels], #7 \n\t"
314 "bic %[pixels], %[pixels], #7 \n\t"
315 "tmcr wcgr1, r12 \n\t"
316 "add r12, r12, #1 \n\t"
317 "add r4, %[pixels], %[line_size]\n\t"
318 "tmcr wcgr2, r12 \n\t"
319 "add r5, %[block], %[line_size] \n\t"
320 "mov %[line_size], %[line_size], lsl #1 \n\t"
321 "pld [r5] \n\t"
322 "pld [r5, #32] \n\t"
323
324 "1: \n\t"
325 "wldrd wr10, [%[pixels]] \n\t"
326 "cmp r12, #8 \n\t"
327 "wldrd wr11, [%[pixels], #8] \n\t"
328 "add %[pixels], %[pixels], %[line_size] \n\t"
329 "wldrd wr13, [r4] \n\t"
330 "pld [%[pixels]] \n\t"
331 "wldrd wr14, [r4, #8] \n\t"
332 "pld [%[pixels], #32] \n\t"
333 "add r4, r4, %[line_size] \n\t"
334 "walignr1 wr0, wr10, wr11 \n\t"
335 "pld [r4] \n\t"
336 "pld [r4, #32] \n\t"
337 "walignr1 wr2, wr13, wr14 \n\t"
338 "wmoveq wr4, wr11 \n\t"
339 "wmoveq wr6, wr14 \n\t"
340 "walignr2ne wr4, wr10, wr11 \n\t"
341 "wldrd wr10, [%[block]] \n\t"
342 "walignr2ne wr6, wr13, wr14 \n\t"
343 "wldrd wr12, [r5] \n\t"
344 WAVG2B" wr0, wr0, wr4 \n\t"
345 WAVG2B" wr2, wr2, wr6 \n\t"
346 WAVG2B" wr0, wr0, wr10 \n\t"
347 WAVG2B" wr2, wr2, wr12 \n\t"
348 "wstrd wr0, [%[block]] \n\t"
349 "subs %[h], %[h], #2 \n\t"
350 "wstrd wr2, [r5] \n\t"
351 "add %[block], %[block], %[line_size] \n\t"
352 "add r5, r5, %[line_size] \n\t"
353 "pld [%[block]] \n\t"
354 "pld [%[block], #32] \n\t"
355 "pld [r5] \n\t"
356 "pld [r5, #32] \n\t"
357 "bne 1b \n\t"
358 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
359 :
360 : "r4", "r5", "r12", "memory");
361 }
362
363 void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
364 {
365 int stride = line_size;
366 // [wr0 wr1 wr2 wr3] for previous line
367 // [wr4 wr5 wr6 wr7] for current line
368 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
369 __asm__ __volatile__(
370 "pld [%[pixels]] \n\t"
371 "pld [%[pixels], #32] \n\t"
372 "pld [%[block]] \n\t"
373 "pld [%[block], #32] \n\t"
374 "and r12, %[pixels], #7 \n\t"
375 "bic %[pixels], %[pixels], #7 \n\t"
376 "tmcr wcgr1, r12 \n\t"
377 "add r12, r12, #1 \n\t"
378 "add r4, %[pixels], %[line_size]\n\t"
379 "tmcr wcgr2, r12 \n\t"
380 "add r5, %[block], %[line_size] \n\t"
381 "mov %[line_size], %[line_size], lsl #1 \n\t"
382 "pld [r5] \n\t"
383 "pld [r5, #32] \n\t"
384
385 "1: \n\t"
386 "wldrd wr10, [%[pixels]] \n\t"
387 "cmp r12, #8 \n\t"
388 "wldrd wr11, [%[pixels], #8] \n\t"
389 "wldrd wr12, [%[pixels], #16] \n\t"
390 "add %[pixels], %[pixels], %[line_size] \n\t"
391 "wldrd wr13, [r4] \n\t"
392 "pld [%[pixels]] \n\t"
393 "wldrd wr14, [r4, #8] \n\t"
394 "pld [%[pixels], #32] \n\t"
395 "wldrd wr15, [r4, #16] \n\t"
396 "add r4, r4, %[line_size] \n\t"
397 "walignr1 wr0, wr10, wr11 \n\t"
398 "pld [r4] \n\t"
399 "pld [r4, #32] \n\t"
400 "walignr1 wr1, wr11, wr12 \n\t"
401 "walignr1 wr2, wr13, wr14 \n\t"
402 "walignr1 wr3, wr14, wr15 \n\t"
403 "wmoveq wr4, wr11 \n\t"
404 "wmoveq wr5, wr12 \n\t"
405 "wmoveq wr6, wr14 \n\t"
406 "wmoveq wr7, wr15 \n\t"
407 "walignr2ne wr4, wr10, wr11 \n\t"
408 "walignr2ne wr5, wr11, wr12 \n\t"
409 "walignr2ne wr6, wr13, wr14 \n\t"
410 "walignr2ne wr7, wr14, wr15 \n\t"
411 "wldrd wr10, [%[block]] \n\t"
412 WAVG2B" wr0, wr0, wr4 \n\t"
413 "wldrd wr11, [%[block], #8] \n\t"
414 WAVG2B" wr1, wr1, wr5 \n\t"
415 "wldrd wr12, [r5] \n\t"
416 WAVG2B" wr2, wr2, wr6 \n\t"
417 "wldrd wr13, [r5, #8] \n\t"
418 WAVG2B" wr3, wr3, wr7 \n\t"
419 WAVG2B" wr0, wr0, wr10 \n\t"
420 WAVG2B" wr1, wr1, wr11 \n\t"
421 WAVG2B" wr2, wr2, wr12 \n\t"
422 WAVG2B" wr3, wr3, wr13 \n\t"
423 "wstrd wr0, [%[block]] \n\t"
424 "subs %[h], %[h], #2 \n\t"
425 "wstrd wr1, [%[block], #8] \n\t"
426 "add %[block], %[block], %[line_size] \n\t"
427 "wstrd wr2, [r5] \n\t"
428 "pld [%[block]] \n\t"
429 "wstrd wr3, [r5, #8] \n\t"
430 "add r5, r5, %[line_size] \n\t"
431 "pld [%[block], #32] \n\t"
432 "pld [r5] \n\t"
433 "pld [r5, #32] \n\t"
434 "bne 1b \n\t"
435 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
436 :
437 :"r4", "r5", "r12", "memory");
438 }
439
440 void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
441 {
442 int stride = line_size;
443 // [wr0 wr1 wr2 wr3] for previous line
444 // [wr4 wr5 wr6 wr7] for current line
445 __asm__ __volatile__(
446 "pld [%[pixels]] \n\t"
447 "pld [%[pixels], #32] \n\t"
448 "and r12, %[pixels], #7 \n\t"
449 "tmcr wcgr1, r12 \n\t"
450 "bic %[pixels], %[pixels], #7 \n\t"
451
452 "wldrd wr10, [%[pixels]] \n\t"
453 "wldrd wr11, [%[pixels], #8] \n\t"
454 "pld [%[block]] \n\t"
455 "add %[pixels], %[pixels], %[line_size] \n\t"
456 "walignr1 wr0, wr10, wr11 \n\t"
457 "pld [%[pixels]] \n\t"
458 "pld [%[pixels], #32] \n\t"
459
460 "1: \n\t"
461 "wldrd wr10, [%[pixels]] \n\t"
462 "wldrd wr11, [%[pixels], #8] \n\t"
463 "add %[pixels], %[pixels], %[line_size] \n\t"
464 "pld [%[pixels]] \n\t"
465 "pld [%[pixels], #32] \n\t"
466 "walignr1 wr4, wr10, wr11 \n\t"
467 "wldrd wr10, [%[block]] \n\t"
468 WAVG2B" wr8, wr0, wr4 \n\t"
469 WAVG2B" wr8, wr8, wr10 \n\t"
470 "wstrd wr8, [%[block]] \n\t"
471 "add %[block], %[block], %[line_size] \n\t"
472
473 "wldrd wr10, [%[pixels]] \n\t"
474 "wldrd wr11, [%[pixels], #8] \n\t"
475 "pld [%[block]] \n\t"
476 "add %[pixels], %[pixels], %[line_size] \n\t"
477 "pld [%[pixels]] \n\t"
478 "pld [%[pixels], #32] \n\t"
479 "walignr1 wr0, wr10, wr11 \n\t"
480 "wldrd wr10, [%[block]] \n\t"
481 WAVG2B" wr8, wr0, wr4 \n\t"
482 WAVG2B" wr8, wr8, wr10 \n\t"
483 "wstrd wr8, [%[block]] \n\t"
484 "add %[block], %[block], %[line_size] \n\t"
485
486 "subs %[h], %[h], #2 \n\t"
487 "pld [%[block]] \n\t"
488 "bne 1b \n\t"
489 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
490 :
491 : "cc", "memory", "r12");
492 }
493
494 void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
495 {
496 int stride = line_size;
497 // [wr0 wr1 wr2 wr3] for previous line
498 // [wr4 wr5 wr6 wr7] for current line
499 __asm__ __volatile__(
500 "pld [%[pixels]] \n\t"
501 "pld [%[pixels], #32] \n\t"
502 "and r12, %[pixels], #7 \n\t"
503 "tmcr wcgr1, r12 \n\t"
504 "bic %[pixels], %[pixels], #7 \n\t"
505
506 "wldrd wr10, [%[pixels]] \n\t"
507 "wldrd wr11, [%[pixels], #8] \n\t"
508 "wldrd wr12, [%[pixels], #16] \n\t"
509 "add %[pixels], %[pixels], %[line_size] \n\t"
510 "pld [%[pixels]] \n\t"
511 "pld [%[pixels], #32] \n\t"
512 "walignr1 wr0, wr10, wr11 \n\t"
513 "walignr1 wr1, wr11, wr12 \n\t"
514
515 "1: \n\t"
516 "wldrd wr10, [%[pixels]] \n\t"
517 "wldrd wr11, [%[pixels], #8] \n\t"
518 "wldrd wr12, [%[pixels], #16] \n\t"
519 "add %[pixels], %[pixels], %[line_size] \n\t"
520 "pld [%[pixels]] \n\t"
521 "pld [%[pixels], #32] \n\t"
522 "walignr1 wr4, wr10, wr11 \n\t"
523 "walignr1 wr5, wr11, wr12 \n\t"
524 WAVG2B" wr8, wr0, wr4 \n\t"
525 WAVG2B" wr9, wr1, wr5 \n\t"
526 "wstrd wr8, [%[block]] \n\t"
527 "wstrd wr9, [%[block], #8] \n\t"
528 "add %[block], %[block], %[line_size] \n\t"
529
530 "wldrd wr10, [%[pixels]] \n\t"
531 "wldrd wr11, [%[pixels], #8] \n\t"
532 "wldrd wr12, [%[pixels], #16] \n\t"
533 "add %[pixels], %[pixels], %[line_size] \n\t"
534 "pld [%[pixels]] \n\t"
535 "pld [%[pixels], #32] \n\t"
536 "walignr1 wr0, wr10, wr11 \n\t"
537 "walignr1 wr1, wr11, wr12 \n\t"
538 WAVG2B" wr8, wr0, wr4 \n\t"
539 WAVG2B" wr9, wr1, wr5 \n\t"
540 "wstrd wr8, [%[block]] \n\t"
541 "wstrd wr9, [%[block], #8] \n\t"
542 "add %[block], %[block], %[line_size] \n\t"
543
544 "subs %[h], %[h], #2 \n\t"
545 "bne 1b \n\t"
546 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
547 :
548 : "r4", "r5", "r12", "memory");
549 }
550
551 void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
552 {
553 int stride = line_size;
554 // [wr0 wr1 wr2 wr3] for previous line
555 // [wr4 wr5 wr6 wr7] for current line
556 __asm__ __volatile__(
557 "pld [%[pixels]] \n\t"
558 "pld [%[pixels], #32] \n\t"
559 "and r12, %[pixels], #7 \n\t"
560 "tmcr wcgr1, r12 \n\t"
561 "bic %[pixels], %[pixels], #7 \n\t"
562
563 "wldrd wr10, [%[pixels]] \n\t"
564 "wldrd wr11, [%[pixels], #8] \n\t"
565 "pld [%[block]] \n\t"
566 "wldrd wr12, [%[pixels], #16] \n\t"
567 "add %[pixels], %[pixels], %[line_size] \n\t"
568 "pld [%[pixels]] \n\t"
569 "pld [%[pixels], #32] \n\t"
570 "walignr1 wr0, wr10, wr11 \n\t"
571 "walignr1 wr1, wr11, wr12 \n\t"
572
573 "1: \n\t"
574 "wldrd wr10, [%[pixels]] \n\t"
575 "wldrd wr11, [%[pixels], #8] \n\t"
576 "wldrd wr12, [%[pixels], #16] \n\t"
577 "add %[pixels], %[pixels], %[line_size] \n\t"
578 "pld [%[pixels]] \n\t"
579 "pld [%[pixels], #32] \n\t"
580 "walignr1 wr4, wr10, wr11 \n\t"
581 "walignr1 wr5, wr11, wr12 \n\t"
582 "wldrd wr10, [%[block]] \n\t"
583 "wldrd wr11, [%[block], #8] \n\t"
584 WAVG2B" wr8, wr0, wr4 \n\t"
585 WAVG2B" wr9, wr1, wr5 \n\t"
586 WAVG2B" wr8, wr8, wr10 \n\t"
587 WAVG2B" wr9, wr9, wr11 \n\t"
588 "wstrd wr8, [%[block]] \n\t"
589 "wstrd wr9, [%[block], #8] \n\t"
590 "add %[block], %[block], %[line_size] \n\t"
591
592 "wldrd wr10, [%[pixels]] \n\t"
593 "wldrd wr11, [%[pixels], #8] \n\t"
594 "pld [%[block]] \n\t"
595 "wldrd wr12, [%[pixels], #16] \n\t"
596 "add %[pixels], %[pixels], %[line_size] \n\t"
597 "pld [%[pixels]] \n\t"
598 "pld [%[pixels], #32] \n\t"
599 "walignr1 wr0, wr10, wr11 \n\t"
600 "walignr1 wr1, wr11, wr12 \n\t"
601 "wldrd wr10, [%[block]] \n\t"
602 "wldrd wr11, [%[block], #8] \n\t"
603 WAVG2B" wr8, wr0, wr4 \n\t"
604 WAVG2B" wr9, wr1, wr5 \n\t"
605 WAVG2B" wr8, wr8, wr10 \n\t"
606 WAVG2B" wr9, wr9, wr11 \n\t"
607 "wstrd wr8, [%[block]] \n\t"
608 "wstrd wr9, [%[block], #8] \n\t"
609 "add %[block], %[block], %[line_size] \n\t"
610
611 "subs %[h], %[h], #2 \n\t"
612 "pld [%[block]] \n\t"
613 "bne 1b \n\t"
614 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
615 :
616 : "r4", "r5", "r12", "memory");
617 }
618
619 void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
620 {
621 // [wr0 wr1 wr2 wr3] for previous line
622 // [wr4 wr5 wr6 wr7] for current line
623 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
624 __asm__ __volatile__(
625 "pld [%[pixels]] \n\t"
626 "mov r12, #2 \n\t"
627 "pld [%[pixels], #32] \n\t"
628 "tmcr wcgr0, r12 \n\t" /* for shift value */
629 "and r12, %[pixels], #7 \n\t"
630 "bic %[pixels], %[pixels], #7 \n\t"
631 "tmcr wcgr1, r12 \n\t"
632
633 // [wr0 wr1 wr2 wr3] <= *
634 // [wr4 wr5 wr6 wr7]
635 "wldrd wr12, [%[pixels]] \n\t"
636 "add r12, r12, #1 \n\t"
637 "wldrd wr13, [%[pixels], #8] \n\t"
638 "tmcr wcgr2, r12 \n\t"
639 "add %[pixels], %[pixels], %[line_size] \n\t"
640 "cmp r12, #8 \n\t"
641 "pld [%[pixels]] \n\t"
642 "pld [%[pixels], #32] \n\t"
643 "walignr1 wr2, wr12, wr13 \n\t"
644 "wmoveq wr10, wr13 \n\t"
645 "walignr2ne wr10, wr12, wr13 \n\t"
646 "wunpckelub wr0, wr2 \n\t"
647 "wunpckehub wr1, wr2 \n\t"
648 "wunpckelub wr8, wr10 \n\t"
649 "wunpckehub wr9, wr10 \n\t"
650 "waddhus wr0, wr0, wr8 \n\t"
651 "waddhus wr1, wr1, wr9 \n\t"
652
653 "1: \n\t"
654 // [wr0 wr1 wr2 wr3]
655 // [wr4 wr5 wr6 wr7] <= *
656 "wldrd wr12, [%[pixels]] \n\t"
657 "cmp r12, #8 \n\t"
658 "wldrd wr13, [%[pixels], #8] \n\t"
659 "add %[pixels], %[pixels], %[line_size] \n\t"
660 "walignr1 wr6, wr12, wr13 \n\t"
661 "pld [%[pixels]] \n\t"
662 "pld [%[pixels], #32] \n\t"
663 "wmoveq wr10, wr13 \n\t"
664 "walignr2ne wr10, wr12, wr13 \n\t"
665 "wunpckelub wr4, wr6 \n\t"
666 "wunpckehub wr5, wr6 \n\t"
667 "wunpckelub wr8, wr10 \n\t"
668 "wunpckehub wr9, wr10 \n\t"
669 "waddhus wr4, wr4, wr8 \n\t"
670 "waddhus wr5, wr5, wr9 \n\t"
671 "waddhus wr8, wr0, wr4 \n\t"
672 "waddhus wr9, wr1, wr5 \n\t"
673 "waddhus wr8, wr8, wr15 \n\t"
674 "waddhus wr9, wr9, wr15 \n\t"
675 "wsrlhg wr8, wr8, wcgr0 \n\t"
676 "wsrlhg wr9, wr9, wcgr0 \n\t"
677 "wpackhus wr8, wr8, wr9 \n\t"
678 "wstrd wr8, [%[block]] \n\t"
679 "add %[block], %[block], %[line_size] \n\t"
680
681 // [wr0 wr1 wr2 wr3] <= *
682 // [wr4 wr5 wr6 wr7]
683 "wldrd wr12, [%[pixels]] \n\t"
684 "wldrd wr13, [%[pixels], #8] \n\t"
685 "add %[pixels], %[pixels], %[line_size] \n\t"
686 "walignr1 wr2, wr12, wr13 \n\t"
687 "pld [%[pixels]] \n\t"
688 "pld [%[pixels], #32] \n\t"
689 "wmoveq wr10, wr13 \n\t"
690 "walignr2ne wr10, wr12, wr13 \n\t"
691 "wunpckelub wr0, wr2 \n\t"
692 "wunpckehub wr1, wr2 \n\t"
693 "wunpckelub wr8, wr10 \n\t"
694 "wunpckehub wr9, wr10 \n\t"
695 "waddhus wr0, wr0, wr8 \n\t"
696 "waddhus wr1, wr1, wr9 \n\t"
697 "waddhus wr8, wr0, wr4 \n\t"
698 "waddhus wr9, wr1, wr5 \n\t"
699 "waddhus wr8, wr8, wr15 \n\t"
700 "waddhus wr9, wr9, wr15 \n\t"
701 "wsrlhg wr8, wr8, wcgr0 \n\t"
702 "wsrlhg wr9, wr9, wcgr0 \n\t"
703 "wpackhus wr8, wr8, wr9 \n\t"
704 "subs %[h], %[h], #2 \n\t"
705 "wstrd wr8, [%[block]] \n\t"
706 "add %[block], %[block], %[line_size] \n\t"
707 "bne 1b \n\t"
708 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
709 : [line_size]"r"(line_size)
710 : "r12", "memory");
711 }
712
713 void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
714 {
715 // [wr0 wr1 wr2 wr3] for previous line
716 // [wr4 wr5 wr6 wr7] for current line
717 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
718 __asm__ __volatile__(
719 "pld [%[pixels]] \n\t"
720 "mov r12, #2 \n\t"
721 "pld [%[pixels], #32] \n\t"
722 "tmcr wcgr0, r12 \n\t" /* for shift value */
723 /* alignment */
724 "and r12, %[pixels], #7 \n\t"
725 "bic %[pixels], %[pixels], #7 \n\t"
726 "tmcr wcgr1, r12 \n\t"
727 "add r12, r12, #1 \n\t"
728 "tmcr wcgr2, r12 \n\t"
729
730 // [wr0 wr1 wr2 wr3] <= *
731 // [wr4 wr5 wr6 wr7]
732 "wldrd wr12, [%[pixels]] \n\t"
733 "cmp r12, #8 \n\t"
734 "wldrd wr13, [%[pixels], #8] \n\t"
735 "wldrd wr14, [%[pixels], #16] \n\t"
736 "add %[pixels], %[pixels], %[line_size] \n\t"
737 "pld [%[pixels]] \n\t"
738 "walignr1 wr2, wr12, wr13 \n\t"
739 "pld [%[pixels], #32] \n\t"
740 "walignr1 wr3, wr13, wr14 \n\t"
741 "wmoveq wr10, wr13 \n\t"
742 "wmoveq wr11, wr14 \n\t"
743 "walignr2ne wr10, wr12, wr13 \n\t"
744 "walignr2ne wr11, wr13, wr14 \n\t"
745 "wunpckelub wr0, wr2 \n\t"
746 "wunpckehub wr1, wr2 \n\t"
747 "wunpckelub wr2, wr3 \n\t"
748 "wunpckehub wr3, wr3 \n\t"
749 "wunpckelub wr8, wr10 \n\t"
750 "wunpckehub wr9, wr10 \n\t"
751 "wunpckelub wr10, wr11 \n\t"
752 "wunpckehub wr11, wr11 \n\t"
753 "waddhus wr0, wr0, wr8 \n\t"
754 "waddhus wr1, wr1, wr9 \n\t"
755 "waddhus wr2, wr2, wr10 \n\t"
756 "waddhus wr3, wr3, wr11 \n\t"
757
758 "1: \n\t"
759 // [wr0 wr1 wr2 wr3]
760 // [wr4 wr5 wr6 wr7] <= *
761 "wldrd wr12, [%[pixels]] \n\t"
762 "cmp r12, #8 \n\t"
763 "wldrd wr13, [%[pixels], #8] \n\t"
764 "wldrd wr14, [%[pixels], #16] \n\t"
765 "add %[pixels], %[pixels], %[line_size] \n\t"
766 "walignr1 wr6, wr12, wr13 \n\t"
767 "pld [%[pixels]] \n\t"
768 "pld [%[pixels], #32] \n\t"
769 "walignr1 wr7, wr13, wr14 \n\t"
770 "wmoveq wr10, wr13 \n\t"
771 "wmoveq wr11, wr14 \n\t"
772 "walignr2ne wr10, wr12, wr13 \n\t"
773 "walignr2ne wr11, wr13, wr14 \n\t"
774 "wunpckelub wr4, wr6 \n\t"
775 "wunpckehub wr5, wr6 \n\t"
776 "wunpckelub wr6, wr7 \n\t"
777 "wunpckehub wr7, wr7 \n\t"
778 "wunpckelub wr8, wr10 \n\t"
779 "wunpckehub wr9, wr10 \n\t"
780 "wunpckelub wr10, wr11 \n\t"
781 "wunpckehub wr11, wr11 \n\t"
782 "waddhus wr4, wr4, wr8 \n\t"
783 "waddhus wr5, wr5, wr9 \n\t"
784 "waddhus wr6, wr6, wr10 \n\t"
785 "waddhus wr7, wr7, wr11 \n\t"
786 "waddhus wr8, wr0, wr4 \n\t"
787 "waddhus wr9, wr1, wr5 \n\t"
788 "waddhus wr10, wr2, wr6 \n\t"
789 "waddhus wr11, wr3, wr7 \n\t"
790 "waddhus wr8, wr8, wr15 \n\t"
791 "waddhus wr9, wr9, wr15 \n\t"
792 "waddhus wr10, wr10, wr15 \n\t"
793 "waddhus wr11, wr11, wr15 \n\t"
794 "wsrlhg wr8, wr8, wcgr0 \n\t"
795 "wsrlhg wr9, wr9, wcgr0 \n\t"
796 "wsrlhg wr10, wr10, wcgr0 \n\t"
797 "wsrlhg wr11, wr11, wcgr0 \n\t"
798 "wpackhus wr8, wr8, wr9 \n\t"
799 "wpackhus wr9, wr10, wr11 \n\t"
800 "wstrd wr8, [%[block]] \n\t"
801 "wstrd wr9, [%[block], #8] \n\t"
802 "add %[block], %[block], %[line_size] \n\t"
803
804 // [wr0 wr1 wr2 wr3] <= *
805 // [wr4 wr5 wr6 wr7]
806 "wldrd wr12, [%[pixels]] \n\t"
807 "wldrd wr13, [%[pixels], #8] \n\t"
808 "wldrd wr14, [%[pixels], #16] \n\t"
809 "add %[pixels], %[pixels], %[line_size] \n\t"
810 "walignr1 wr2, wr12, wr13 \n\t"
811 "pld [%[pixels]] \n\t"
812 "pld [%[pixels], #32] \n\t"
813 "walignr1 wr3, wr13, wr14 \n\t"
814 "wmoveq wr10, wr13 \n\t"
815 "wmoveq wr11, wr14 \n\t"
816 "walignr2ne wr10, wr12, wr13 \n\t"
817 "walignr2ne wr11, wr13, wr14 \n\t"
818 "wunpckelub wr0, wr2 \n\t"
819 "wunpckehub wr1, wr2 \n\t"
820 "wunpckelub wr2, wr3 \n\t"
821 "wunpckehub wr3, wr3 \n\t"
822 "wunpckelub wr8, wr10 \n\t"
823 "wunpckehub wr9, wr10 \n\t"
824 "wunpckelub wr10, wr11 \n\t"
825 "wunpckehub wr11, wr11 \n\t"
826 "waddhus wr0, wr0, wr8 \n\t"
827 "waddhus wr1, wr1, wr9 \n\t"
828 "waddhus wr2, wr2, wr10 \n\t"
829 "waddhus wr3, wr3, wr11 \n\t"
830 "waddhus wr8, wr0, wr4 \n\t"
831 "waddhus wr9, wr1, wr5 \n\t"
832 "waddhus wr10, wr2, wr6 \n\t"
833 "waddhus wr11, wr3, wr7 \n\t"
834 "waddhus wr8, wr8, wr15 \n\t"
835 "waddhus wr9, wr9, wr15 \n\t"
836 "waddhus wr10, wr10, wr15 \n\t"
837 "waddhus wr11, wr11, wr15 \n\t"
838 "wsrlhg wr8, wr8, wcgr0 \n\t"
839 "wsrlhg wr9, wr9, wcgr0 \n\t"
840 "wsrlhg wr10, wr10, wcgr0 \n\t"
841 "wsrlhg wr11, wr11, wcgr0 \n\t"
842 "wpackhus wr8, wr8, wr9 \n\t"
843 "wpackhus wr9, wr10, wr11 \n\t"
844 "wstrd wr8, [%[block]] \n\t"
845 "wstrd wr9, [%[block], #8] \n\t"
846 "add %[block], %[block], %[line_size] \n\t"
847
848 "subs %[h], %[h], #2 \n\t"
849 "bne 1b \n\t"
850 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
851 : [line_size]"r"(line_size)
852 : "r12", "memory");
853 }
854
855 void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
856 {
857 // [wr0 wr1 wr2 wr3] for previous line
858 // [wr4 wr5 wr6 wr7] for current line
859 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
860 __asm__ __volatile__(
861 "pld [%[block]] \n\t"
862 "pld [%[block], #32] \n\t"
863 "pld [%[pixels]] \n\t"
864 "mov r12, #2 \n\t"
865 "pld [%[pixels], #32] \n\t"
866 "tmcr wcgr0, r12 \n\t" /* for shift value */
867 "and r12, %[pixels], #7 \n\t"
868 "bic %[pixels], %[pixels], #7 \n\t"
869 "tmcr wcgr1, r12 \n\t"
870
871 // [wr0 wr1 wr2 wr3] <= *
872 // [wr4 wr5 wr6 wr7]
873 "wldrd wr12, [%[pixels]] \n\t"
874 "add r12, r12, #1 \n\t"
875 "wldrd wr13, [%[pixels], #8] \n\t"
876 "tmcr wcgr2, r12 \n\t"
877 "add %[pixels], %[pixels], %[line_size] \n\t"
878 "cmp r12, #8 \n\t"
879 "pld [%[pixels]] \n\t"
880 "pld [%[pixels], #32] \n\t"
881 "walignr1 wr2, wr12, wr13 \n\t"
882 "wmoveq wr10, wr13 \n\t"
883 "walignr2ne wr10, wr12, wr13 \n\t"
884 "wunpckelub wr0, wr2 \n\t"
885 "wunpckehub wr1, wr2 \n\t"
886 "wunpckelub wr8, wr10 \n\t"
887 "wunpckehub wr9, wr10 \n\t"
888 "waddhus wr0, wr0, wr8 \n\t"
889 "waddhus wr1, wr1, wr9 \n\t"
890
891 "1: \n\t"
892 // [wr0 wr1 wr2 wr3]
893 // [wr4 wr5 wr6 wr7] <= *
894 "wldrd wr12, [%[pixels]] \n\t"
895 "cmp r12, #8 \n\t"
896 "wldrd wr13, [%[pixels], #8] \n\t"
897 "add %[pixels], %[pixels], %[line_size] \n\t"
898 "walignr1 wr6, wr12, wr13 \n\t"
899 "pld [%[pixels]] \n\t"
900 "pld [%[pixels], #32] \n\t"
901 "wmoveq wr10, wr13 \n\t"
902 "walignr2ne wr10, wr12, wr13 \n\t"
903 "wunpckelub wr4, wr6 \n\t"
904 "wunpckehub wr5, wr6 \n\t"
905 "wunpckelub wr8, wr10 \n\t"
906 "wunpckehub wr9, wr10 \n\t"
907 "waddhus wr4, wr4, wr8 \n\t"
908 "waddhus wr5, wr5, wr9 \n\t"
909 "waddhus wr8, wr0, wr4 \n\t"
910 "waddhus wr9, wr1, wr5 \n\t"
911 "waddhus wr8, wr8, wr15 \n\t"
912 "waddhus wr9, wr9, wr15 \n\t"
913 "wldrd wr12, [%[block]] \n\t"
914 "wsrlhg wr8, wr8, wcgr0 \n\t"
915 "wsrlhg wr9, wr9, wcgr0 \n\t"
916 "wpackhus wr8, wr8, wr9 \n\t"
917 WAVG2B" wr8, wr8, wr12 \n\t"
918 "wstrd wr8, [%[block]] \n\t"
919 "add %[block], %[block], %[line_size] \n\t"
920 "wldrd wr12, [%[pixels]] \n\t"
921 "pld [%[block]] \n\t"
922 "pld [%[block], #32] \n\t"
923
924 // [wr0 wr1 wr2 wr3] <= *
925 // [wr4 wr5 wr6 wr7]
926 "wldrd wr13, [%[pixels], #8] \n\t"
927 "add %[pixels], %[pixels], %[line_size] \n\t"
928 "walignr1 wr2, wr12, wr13 \n\t"
929 "pld [%[pixels]] \n\t"
930 "pld [%[pixels], #32] \n\t"
931 "wmoveq wr10, wr13 \n\t"
932 "walignr2ne wr10, wr12, wr13 \n\t"
933 "wunpckelub wr0, wr2 \n\t"
934 "wunpckehub wr1, wr2 \n\t"
935 "wunpckelub wr8, wr10 \n\t"
936 "wunpckehub wr9, wr10 \n\t"
937 "waddhus wr0, wr0, wr8 \n\t"
938 "waddhus wr1, wr1, wr9 \n\t"
939 "waddhus wr8, wr0, wr4 \n\t"
940 "waddhus wr9, wr1, wr5 \n\t"
941 "waddhus wr8, wr8, wr15 \n\t"
942 "waddhus wr9, wr9, wr15 \n\t"
943 "wldrd wr12, [%[block]] \n\t"
944 "wsrlhg wr8, wr8, wcgr0 \n\t"
945 "wsrlhg wr9, wr9, wcgr0 \n\t"
946 "wpackhus wr8, wr8, wr9 \n\t"
947 "subs %[h], %[h], #2 \n\t"
948 WAVG2B" wr8, wr8, wr12 \n\t"
949 "wstrd wr8, [%[block]] \n\t"
950 "add %[block], %[block], %[line_size] \n\t"
951 "pld [%[block]] \n\t"
952 "pld [%[block], #32] \n\t"
953 "bne 1b \n\t"
954 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
955 : [line_size]"r"(line_size)
956 : "r12", "memory");
957 }
958
959 void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
960 {
961 // [wr0 wr1 wr2 wr3] for previous line
962 // [wr4 wr5 wr6 wr7] for current line
963 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
964 __asm__ __volatile__(
965 "pld [%[block]] \n\t"
966 "pld [%[block], #32] \n\t"
967 "pld [%[pixels]] \n\t"
968 "mov r12, #2 \n\t"
969 "pld [%[pixels], #32] \n\t"
970 "tmcr wcgr0, r12 \n\t" /* for shift value */
971 /* alignment */
972 "and r12, %[pixels], #7 \n\t"
973 "bic %[pixels], %[pixels], #7 \n\t"
974 "tmcr wcgr1, r12 \n\t"
975 "add r12, r12, #1 \n\t"
976 "tmcr wcgr2, r12 \n\t"
977
978 // [wr0 wr1 wr2 wr3] <= *
979 // [wr4 wr5 wr6 wr7]
980 "wldrd wr12, [%[pixels]] \n\t"
981 "cmp r12, #8 \n\t"
982 "wldrd wr13, [%[pixels], #8] \n\t"
983 "wldrd wr14, [%[pixels], #16] \n\t"
984 "add %[pixels], %[pixels], %[line_size] \n\t"
985 "pld [%[pixels]] \n\t"
986 "walignr1 wr2, wr12, wr13 \n\t"
987 "pld [%[pixels], #32] \n\t"
988 "walignr1 wr3, wr13, wr14 \n\t"
989 "wmoveq wr10, wr13 \n\t"
990 "wmoveq wr11, wr14 \n\t"
991 "walignr2ne wr10, wr12, wr13 \n\t"
992 "walignr2ne wr11, wr13, wr14 \n\t"
993 "wunpckelub wr0, wr2 \n\t"
994 "wunpckehub wr1, wr2 \n\t"
995 "wunpckelub wr2, wr3 \n\t"
996 "wunpckehub wr3, wr3 \n\t"
997 "wunpckelub wr8, wr10 \n\t"
998 "wunpckehub wr9, wr10 \n\t"
999 "wunpckelub wr10, wr11 \n\t"
1000 "wunpckehub wr11, wr11 \n\t"
1001 "waddhus wr0, wr0, wr8 \n\t"
1002 "waddhus wr1, wr1, wr9 \n\t"
1003 "waddhus wr2, wr2, wr10 \n\t"
1004 "waddhus wr3, wr3, wr11 \n\t"
1005
1006 "1: \n\t"
1007 // [wr0 wr1 wr2 wr3]
1008 // [wr4 wr5 wr6 wr7] <= *
1009 "wldrd wr12, [%[pixels]] \n\t"
1010 "cmp r12, #8 \n\t"
1011 "wldrd wr13, [%[pixels], #8] \n\t"
1012 "wldrd wr14, [%[pixels], #16] \n\t"
1013 "add %[pixels], %[pixels], %[line_size] \n\t"
1014 "walignr1 wr6, wr12, wr13 \n\t"
1015 "pld [%[pixels]] \n\t"
1016 "pld [%[pixels], #32] \n\t"
1017 "walignr1 wr7, wr13, wr14 \n\t"
1018 "wmoveq wr10, wr13 \n\t"
1019 "wmoveq wr11, wr14 \n\t"
1020 "walignr2ne wr10, wr12, wr13 \n\t"
1021 "walignr2ne wr11, wr13, wr14 \n\t"
1022 "wunpckelub wr4, wr6 \n\t"
1023 "wunpckehub wr5, wr6 \n\t"
1024 "wunpckelub wr6, wr7 \n\t"
1025 "wunpckehub wr7, wr7 \n\t"
1026 "wunpckelub wr8, wr10 \n\t"
1027 "wunpckehub wr9, wr10 \n\t"
1028 "wunpckelub wr10, wr11 \n\t"
1029 "wunpckehub wr11, wr11 \n\t"
1030 "waddhus wr4, wr4, wr8 \n\t"
1031 "waddhus wr5, wr5, wr9 \n\t"
1032 "waddhus wr6, wr6, wr10 \n\t"
1033 "waddhus wr7, wr7, wr11 \n\t"
1034 "waddhus wr8, wr0, wr4 \n\t"
1035 "waddhus wr9, wr1, wr5 \n\t"
1036 "waddhus wr10, wr2, wr6 \n\t"
1037 "waddhus wr11, wr3, wr7 \n\t"
1038 "waddhus wr8, wr8, wr15 \n\t"
1039 "waddhus wr9, wr9, wr15 \n\t"
1040 "waddhus wr10, wr10, wr15 \n\t"
1041 "waddhus wr11, wr11, wr15 \n\t"
1042 "wsrlhg wr8, wr8, wcgr0 \n\t"
1043 "wsrlhg wr9, wr9, wcgr0 \n\t"
1044 "wldrd wr12, [%[block]] \n\t"
1045 "wldrd wr13, [%[block], #8] \n\t"
1046 "wsrlhg wr10, wr10, wcgr0 \n\t"
1047 "wsrlhg wr11, wr11, wcgr0 \n\t"
1048 "wpackhus wr8, wr8, wr9 \n\t"
1049 "wpackhus wr9, wr10, wr11 \n\t"
1050 WAVG2B" wr8, wr8, wr12 \n\t"
1051 WAVG2B" wr9, wr9, wr13 \n\t"
1052 "wstrd wr8, [%[block]] \n\t"
1053 "wstrd wr9, [%[block], #8] \n\t"
1054 "add %[block], %[block], %[line_size] \n\t"
1055
1056 // [wr0 wr1 wr2 wr3] <= *
1057 // [wr4 wr5 wr6 wr7]
1058 "wldrd wr12, [%[pixels]] \n\t"
1059 "pld [%[block]] \n\t"
1060 "wldrd wr13, [%[pixels], #8] \n\t"
1061 "pld [%[block], #32] \n\t"
1062 "wldrd wr14, [%[pixels], #16] \n\t"
1063 "add %[pixels], %[pixels], %[line_size] \n\t"
1064 "walignr1 wr2, wr12, wr13 \n\t"
1065 "pld [%[pixels]] \n\t"
1066 "pld [%[pixels], #32] \n\t"
1067 "walignr1 wr3, wr13, wr14 \n\t"
1068 "wmoveq wr10, wr13 \n\t"
1069 "wmoveq wr11, wr14 \n\t"
1070 "walignr2ne wr10, wr12, wr13 \n\t"
1071 "walignr2ne wr11, wr13, wr14 \n\t"
1072 "wunpckelub wr0, wr2 \n\t"
1073 "wunpckehub wr1, wr2 \n\t"
1074 "wunpckelub wr2, wr3 \n\t"
1075 "wunpckehub wr3, wr3 \n\t"
1076 "wunpckelub wr8, wr10 \n\t"
1077 "wunpckehub wr9, wr10 \n\t"
1078 "wunpckelub wr10, wr11 \n\t"
1079 "wunpckehub wr11, wr11 \n\t"
1080 "waddhus wr0, wr0, wr8 \n\t"
1081 "waddhus wr1, wr1, wr9 \n\t"
1082 "waddhus wr2, wr2, wr10 \n\t"
1083 "waddhus wr3, wr3, wr11 \n\t"
1084 "waddhus wr8, wr0, wr4 \n\t"
1085 "waddhus wr9, wr1, wr5 \n\t"
1086 "waddhus wr10, wr2, wr6 \n\t"
1087 "waddhus wr11, wr3, wr7 \n\t"
1088 "waddhus wr8, wr8, wr15 \n\t"
1089 "waddhus wr9, wr9, wr15 \n\t"
1090 "waddhus wr10, wr10, wr15 \n\t"
1091 "waddhus wr11, wr11, wr15 \n\t"
1092 "wsrlhg wr8, wr8, wcgr0 \n\t"
1093 "wsrlhg wr9, wr9, wcgr0 \n\t"
1094 "wldrd wr12, [%[block]] \n\t"
1095 "wldrd wr13, [%[block], #8] \n\t"
1096 "wsrlhg wr10, wr10, wcgr0 \n\t"
1097 "wsrlhg wr11, wr11, wcgr0 \n\t"
1098 "wpackhus wr8, wr8, wr9 \n\t"
1099 "wpackhus wr9, wr10, wr11 \n\t"
1100 WAVG2B" wr8, wr8, wr12 \n\t"
1101 WAVG2B" wr9, wr9, wr13 \n\t"
1102 "wstrd wr8, [%[block]] \n\t"
1103 "wstrd wr9, [%[block], #8] \n\t"
1104 "add %[block], %[block], %[line_size] \n\t"
1105 "subs %[h], %[h], #2 \n\t"
1106 "pld [%[block]] \n\t"
1107 "pld [%[block], #32] \n\t"
1108 "bne 1b \n\t"
1109 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
1110 : [line_size]"r"(line_size)
1111 : "r12", "memory");
1112 }