;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_16
cextern pw_8
cextern pw_4
cextern pw_2
cextern pw_1

pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3:        times 8 dw -3
pw_pixel_max: times 8 dw ((1 << 10)-1)
pw_512:       times 8 dw 512
pd_17:        times 4 dd 17
pd_16:        times 4 dd 16

SECTION .text

; dest, left, right, src
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED4x4_LOWPASS 4
    paddw     %2, %3
    psrlw     %2, 1
    pavgw     %1, %4, %2
%endmacro
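; For reference, a rough C equivalent of this macro (an illustrative sketch,
; not the actual C template; the function name is made up):
;
;   static inline int lowpass(int left, int mid, int right)
;   {
;       return (left + 2 * mid + right + 2) >> 2;
;   }
;
; The pavgw form is exact: pavgw computes (a + b + 1) >> 1, and
; avg(mid, (left + right) >> 1) rounds to the same value as the expression
; above for all 10-bit inputs.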

;-----------------------------------------------------------------------------
; void ff_pred4x4_down_right(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_DR 0
cglobal pred4x4_down_right_10, 3, 3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movhps    m1, [r1-8]
    movhps    m2, [r0+r2*1-8]
    movhps    m4, [r0-8]
    punpckhwd m2, m4
    movq      m3, [r0]
    punpckhdq m1, m2
    PALIGNR   m3, m1, 10, m1
    movhps    m4, [r1+r2*1-8]
    PALIGNR   m0, m3, m4, 14, m4
    movhps    m4, [r1+r2*2-8]
    PALIGNR   m2, m0, m4, 14, m4
    PRED4x4_LOWPASS m0, m2, m3, m0
    movq      [r1+r2*2], m0
    psrldq    m0, 2
    movq      [r1+r2*1], m0
    psrldq    m0, 2
    movq      [r0+r2*2], m0
    psrldq    m0, 2
    movq      [r0+r2*1], m0
    RET
%endmacro
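; Sketched in C, the mode fills each down-right diagonal with the lowpass of
; three consecutive edge samples (a sketch under the usual H.264 definition;
; e[] is an illustrative name for the edge array {l3,l2,l1,l0,lt,t0..t3}):
;
;   static void pred4x4_ddr_sketch(uint16_t pred[4][4], const uint16_t e[9])
;   {
;       for (int y = 0; y < 4; y++)
;           for (int x = 0; x < 4; x++)
;               pred[y][x] = (e[3+x-y] + 2*e[4+x-y] + e[5+x-y] + 2) >> 2;
;   }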

INIT_XMM sse2
PRED4x4_DR
INIT_XMM ssse3
PRED4x4_DR
INIT_XMM avx
PRED4x4_DR

;------------------------------------------------------------------------------
; void ff_pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
;------------------------------------------------------------------------------
%macro PRED4x4_VR 0
cglobal pred4x4_vertical_right_10, 3, 3, 6
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movq      m5, [r0]            ; ........t3t2t1t0
    movhps    m1, [r0-8]
    PALIGNR   m0, m5, m1, 14, m1  ; ......t3t2t1t0lt
    pavgw     m5, m0
    movhps    m1, [r0+r2*1-8]
    PALIGNR   m0, m1, 14, m1      ; ....t3t2t1t0ltl0
    movhps    m2, [r0+r2*2-8]
    PALIGNR   m1, m0, m2, 14, m2  ; ..t3t2t1t0ltl0l1
    movhps    m3, [r1+r2*1-8]
    PALIGNR   m2, m1, m3, 14, m3  ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m1, m0, m2, m1
    pslldq    m0, m1, 12
    psrldq    m1, 4
    movq      [r0+r2*1], m5
    movq      [r0+r2*2], m1
    PALIGNR   m5, m0, 14, m2
    pslldq    m0, 2
    movq      [r1+r2*1], m5
    PALIGNR   m1, m0, 14, m0
    movq      [r1+r2*2], m1
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VR
INIT_XMM ssse3
PRED4x4_VR
INIT_XMM avx
PRED4x4_VR

;-------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
;-------------------------------------------------------------------------------
%macro PRED4x4_HD 0
cglobal pred4x4_horizontal_down_10, 3, 3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movq      m0, [r0-8]          ; lt ..
    movhps    m0, [r0]
    pslldq    m0, 2               ; t2 t1 t0 lt .. .. .. ..
    movq      m1, [r1+r2*2-8]     ; l3
    movq      m3, [r1+r2*1-8]
    punpcklwd m1, m3              ; l2 l3
    movq      m2, [r0+r2*2-8]     ; l1
    movq      m3, [r0+r2*1-8]
    punpcklwd m2, m3              ; l0 l1
    punpckhdq m1, m2              ; l0 l1 l2 l3
    punpckhqdq m1, m0             ; t2 t1 t0 lt l0 l1 l2 l3
    psrldq    m0, m1, 4           ; .. .. t2 t1 t0 lt l0 l1
    psrldq    m3, m1, 2           ; .. t2 t1 t0 lt l0 l1 l2
    pavgw     m5, m1, m3
    PRED4x4_LOWPASS m3, m1, m0, m3
    punpcklwd m5, m3
    psrldq    m3, 8
    PALIGNR   m3, m5, 12, m4
    movq      [r1+r2*2], m5
    movhps    [r0+r2*2], m5
    psrldq    m5, 4
    movq      [r1+r2*1], m5
    movq      [r0+r2*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED4x4_HD
INIT_XMM ssse3
PRED4x4_HD
INIT_XMM avx
PRED4x4_HD

;-----------------------------------------------------------------------------
; void ff_pred4x4_dc(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro HADDD 2 ; sum junk
%if mmsize == 16
    movhlps   %2, %1
    paddd     %1, %2
    pshuflw   %2, %1, 0xE
    paddd     %1, %2
%else
    pshufw    %2, %1, 0xE
    paddd     %1, %2
%endif
%endmacro

%macro HADDW 2
    pmaddwd   %1, [pw_1]
    HADDD     %1, %2
%endmacro
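; HADDW reduces 8 words to a single dword sum: pmaddwd against pw_1 folds
; adjacent words into four dword sums, then HADDD folds those down to one.
; A rough scalar sketch of what ends up in the low dword of %1 (illustrative
; helper, not part of the tree):
;
;   static int haddw(const uint16_t v[8])
;   {
;       int sum = 0;
;       for (int i = 0; i < 8; i++)
;           sum += v[i];
;       return sum;
;   }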

INIT_MMX mmxext
cglobal pred4x4_dc_10, 3, 3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movq      m2, [r0+r2*1-8]
    paddw     m2, [r0+r2*2-8]
    paddw     m2, [r1+r2*1-8]
    paddw     m2, [r1+r2*2-8]
    psrlq     m2, 48
    movq      m0, [r0]
    HADDW     m0, m1
    paddw     m0, [pw_4]
    paddw     m0, m2
    psrlw     m0, 3
    SPLATW    m0, m0, 0
    movq      [r0+r2*1], m0
    movq      [r0+r2*2], m0
    movq      [r1+r2*1], m0
    movq      [r1+r2*2], m0
    RET
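; The value written to all 16 pixels is the standard 4x4 DC term, sketched as:
;
;   dc = (t0 + t1 + t2 + t3 + l0 + l1 + l2 + l3 + 4) >> 3;
;
; The four paddw ops sum the rows column-wise, so the high word of m2 holds
; l0+l1+l2+l3; psrlq m2, 48 moves it to the low word, and HADDW supplies the
; top sum.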

;-----------------------------------------------------------------------------
; void ff_pred4x4_down_left(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_DL 0
cglobal pred4x4_down_left_10, 3, 3
    sub       r0, r2
    movq      m0, [r0]
    movhps    m0, [r1]
    psrldq    m2, m0, 2
    pslldq    m3, m0, 2
    pshufhw   m2, m2, 10100100b
    PRED4x4_LOWPASS m0, m3, m2, m0
    lea       r1, [r0+r2*2]
    movhps    [r1+r2*2], m0
    psrldq    m0, 2
    movq      [r0+r2*1], m0
    psrldq    m0, 2
    movq      [r0+r2*2], m0
    psrldq    m0, 2
    movq      [r1+r2*1], m0
    RET
%endmacro
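; In C terms the mode is (a sketch; t[] holds the 4 top plus 4 top-right
; samples, and the out-of-range t[8] clamps to t[7] as the spec requires,
; which is what the pshufhw duplication above implements):
;
;   static void pred4x4_ddl_sketch(uint16_t pred[4][4], const uint16_t t[8])
;   {
;       for (int y = 0; y < 4; y++)
;           for (int x = 0; x < 4; x++) {
;               int hi = x + y + 2 < 8 ? x + y + 2 : 7;
;               pred[y][x] = (t[x+y] + 2*t[x+y+1] + t[hi] + 2) >> 2;
;           }
;   }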

INIT_XMM sse2
PRED4x4_DL
INIT_XMM avx
PRED4x4_DL

;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_VL 0
cglobal pred4x4_vertical_left_10, 3, 3
    sub       r0, r2
    movu      m1, [r0]
    movhps    m1, [r1]
    psrldq    m0, m1, 2
    psrldq    m2, m1, 4
    pavgw     m4, m0, m1
    PRED4x4_LOWPASS m0, m1, m2, m0
    lea       r1, [r0+r2*2]
    movq      [r0+r2*1], m4
    movq      [r0+r2*2], m0
    psrldq    m4, 2
    psrldq    m0, 2
    movq      [r1+r2*1], m4
    movq      [r1+r2*2], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VL
INIT_XMM avx
PRED4x4_VL

;-----------------------------------------------------------------------------
; void ff_pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_10, 3, 3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movq      m0, [r0+r2*1-8]
    punpckhwd m0, [r0+r2*2-8]
    movq      m1, [r1+r2*1-8]
    punpckhwd m1, [r1+r2*2-8]
    punpckhdq m0, m1
    pshufw    m1, m1, 0xFF
    movq      [r1+r2*2], m1
    movd      [r1+r2*1+4], m1
    pshufw    m2, m0, 11111001b
    movq      m1, m2
    pavgw     m2, m0

    pshufw    m5, m0, 11111110b
    PRED4x4_LOWPASS m1, m0, m5, m1
    movq      m6, m2
    punpcklwd m6, m1
    movq      [r0+r2*1], m6
    psrlq     m2, 16
    psrlq     m1, 16
    punpcklwd m2, m1
    movq      [r0+r2*2], m2
    psrlq     m2, 32
    movd      [r1+r2*1], m2
    RET
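; Horizontal-up interleaves averages and lowpass values of the left column,
; per the spec rule. A C sketch (l[] is the left column; names illustrative):
;
;   static void pred4x4_hu_sketch(uint16_t pred[4][4], const uint16_t l[4])
;   {
;       for (int y = 0; y < 4; y++)
;           for (int x = 0; x < 4; x++) {
;               int z = x + 2 * y, i = y + (x >> 1);
;               if (z > 5)       pred[y][x] = l[3];
;               else if (z == 5) pred[y][x] = (l[2] + 3*l[3] + 2) >> 2;
;               else if (z & 1)  pred[y][x] = (l[i] + 2*l[i+1] + l[i+2] + 2) >> 2;
;               else             pred[y][x] = (l[i] + l[i+1] + 1) >> 1;
;           }
;   }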


;-----------------------------------------------------------------------------
; void ff_pred8x8_vertical(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_vertical_10, 2, 2
    sub       r0, r1
    mova      m0, [r0]
%rep 3
    mova      [r0+r1*1], m0
    mova      [r0+r1*2], m0
    lea       r0, [r0+r1*2]
%endrep
    mova      [r0+r1*1], m0
    mova      [r0+r1*2], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_horizontal(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_horizontal_10, 2, 3
    mov       r2d, 4
.loop:
    movq      m0, [r0+r1*0-8]
    movq      m1, [r0+r1*1-8]
    pshuflw   m0, m0, 0xff
    pshuflw   m1, m1, 0xff
    punpcklqdq m0, m0
    punpcklqdq m1, m1
    mova      [r0+r1*0], m0
    mova      [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro MOV8 2-3
; sort of a hack, but it works: one mova for xmm, two movqs for mmx
%if mmsize==8
    movq      [%1+0], %2
    movq      [%1+8], %3
%else
    movdqa    [%1], %2
%endif
%endmacro

%macro PRED8x8_DC 1
cglobal pred8x8_dc_10, 2, 6
    sub       r0, r1
    pxor      m4, m4
    movq      m0, [r0+0]
    movq      m1, [r0+8]
%if mmsize==16
    punpcklwd m0, m1
    movhlps   m1, m0
    paddw     m0, m1
%else
    pshufw    m2, m0, 00001110b
    pshufw    m3, m1, 00001110b
    paddw     m0, m2
    paddw     m1, m3
    punpcklwd m0, m1
%endif
    %1        m2, m0, 00001110b
    paddw     m0, m2

    lea       r5, [r1*3]
    lea       r4, [r0+r1*4]
    movzx     r2d, word [r0+r1*1-2]
    movzx     r3d, word [r0+r1*2-2]
    add       r2d, r3d
    movzx     r3d, word [r0+r5*1-2]
    add       r2d, r3d
    movzx     r3d, word [r4-2]
    add       r2d, r3d
    movd      m2, r2d             ; s2

    movzx     r2d, word [r4+r1*1-2]
    movzx     r3d, word [r4+r1*2-2]
    add       r2d, r3d
    movzx     r3d, word [r4+r5*1-2]
    add       r2d, r3d
    movzx     r3d, word [r4+r1*4-2]
    add       r2d, r3d
    movd      m3, r2d             ; s3

    punpcklwd m2, m3
    punpckldq m0, m2              ; s0, s1, s2, s3
    %1        m3, m0, 11110110b   ; s2, s1, s3, s3
    %1        m0, m0, 01110100b   ; s0, s1, s3, s1
    paddw     m0, m3
    psrlw     m0, 2
    pavgw     m0, m4              ; s0+s2, s1, s3, s1+s3
%if mmsize==16
    punpcklwd m0, m0
    pshufd    m3, m0, 11111010b
    punpckldq m0, m0
    SWAP      0, 1
%else
    pshufw    m1, m0, 0x00
    pshufw    m2, m0, 0x55
    pshufw    m3, m0, 0xaa
    pshufw    m4, m0, 0xff
%endif
    MOV8      r0+r1*1, m1, m2
    MOV8      r0+r1*2, m1, m2
    MOV8      r0+r5*1, m1, m2
    MOV8      r0+r1*4, m1, m2
    MOV8      r4+r1*1, m3, m4
    MOV8      r4+r1*2, m3, m4
    MOV8      r4+r5*1, m3, m4
    MOV8      r4+r1*4, m3, m4
    RET
%endmacro
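; Each 4x4 quadrant gets its own DC value, as in the spec (a sketch, with
; sumT03/sumT47 the sums of top pixels 0-3/4-7 and sumL03/sumL47 the same
; for the left column):
;
;   dc[0][0] = (sumT03 + sumL03 + 4) >> 3;   // top-left
;   dc[0][1] = (sumT47 + 2) >> 2;            // top-right
;   dc[1][0] = (sumL47 + 2) >> 2;            // bottom-left
;   dc[1][1] = (sumT47 + sumL47 + 4) >> 3;   // bottom-right
;
; The psrlw/pavgw pair above evaluates all four divisions at once.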

INIT_MMX mmxext
PRED8x8_DC pshufw
INIT_XMM sse2
PRED8x8_DC pshuflw

;-----------------------------------------------------------------------------
; void ff_pred8x8_top_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_top_dc_10, 2, 4
    sub       r0, r1
    mova      m0, [r0]
    pshuflw   m1, m0, 0x4e
    pshufhw   m1, m1, 0x4e
    paddw     m0, m1
    pshuflw   m1, m0, 0xb1
    pshufhw   m1, m1, 0xb1
    paddw     m0, m1
    lea       r2, [r1*3]
    lea       r3, [r0+r1*4]
    paddw     m0, [pw_2]
    psrlw     m0, 2
    mova      [r0+r1*1], m0
    mova      [r0+r1*2], m0
    mova      [r0+r2*1], m0
    mova      [r0+r1*4], m0
    mova      [r3+r1*1], m0
    mova      [r3+r1*2], m0
    mova      [r3+r2*1], m0
    mova      [r3+r1*4], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_plane(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_plane_10, 2, 7, 7
    sub       r0, r1
    lea       r2, [r1*3]
    lea       r3, [r0+r1*4]
    mova      m2, [r0]
    pmaddwd   m2, [pw_m32101234]
    HADDD     m2, m1
    movd      m0, [r0-4]
    psrld     m0, 14
    psubw     m2, m0              ; H
    movd      m0, [r3+r1*4-4]
    movd      m1, [r0+12]
    paddw     m0, m1
    psllw     m0, 4               ; 16*(src[7*stride-1] + src[-stride+7])
    movzx     r4d, word [r3+r1*1-2] ; src[4*stride-1]
    movzx     r5d, word [r0+r2*1-2] ; src[2*stride-1]
    sub       r4d, r5d
    movzx     r6d, word [r3+r1*2-2] ; src[5*stride-1]
    movzx     r5d, word [r0+r1*2-2] ; src[1*stride-1]
    sub       r6d, r5d
    lea       r4d, [r4+r6*2]
    movzx     r5d, word [r3+r2*1-2] ; src[6*stride-1]
    movzx     r6d, word [r0+r1*1-2] ; src[0*stride-1]
    sub       r5d, r6d
    lea       r5d, [r5*3]
    add       r4d, r5d
    movzx     r6d, word [r3+r1*4-2] ; src[7*stride-1]
    movzx     r5d, word [r0+r1*0-2] ; src[ -stride-1]
    sub       r6d, r5d
    lea       r4d, [r4+r6*4]
    movd      m3, r4d             ; V
    punpckldq m2, m3
    pmaddwd   m2, [pd_17]
    paddd     m2, [pd_16]
    psrad     m2, 5               ; b, c

    mova      m3, [pw_pixel_max]
    pxor      m1, m1
    SPLATW    m0, m0, 1
    SPLATW    m4, m2, 2
    SPLATW    m2, m2, 0
    pmullw    m2, [pw_m32101234]  ; b
    pmullw    m5, m4, [pw_m3]     ; c
    paddw     m5, [pw_16]
    mov       r2d, 8
    add       r0, r1
.loop:
    paddsw    m6, m2, m5
    paddsw    m6, m0
    psraw     m6, 5
    CLIPW     m6, m1, m3
    mova      [r0], m6
    paddw     m5, m4
    add       r0, r1
    dec       r2d
    jg .loop
    REP_RET
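; The plane mode fits a linear ramp over the block. In rough C (a sketch of
; the 8x8 chroma-style plane formula; t points at the row above dst so t[-1]
; is the corner, left(i) stands for dst[i*stride - 1] with left(-1) = t[-1],
; and av_clip is the libavutil clamp):
;
;   int H = 0, V = 0;
;   for (int i = 0; i < 4; i++) {
;       H += (i + 1) * (t[4 + i] - t[2 - i]);
;       V += (i + 1) * (left(4 + i) - left(2 - i));
;   }
;   int a = 16 * (t[7] + left(7));
;   int b = (17 * H + 16) >> 5;
;   int c = (17 * V + 16) >> 5;
;   for (int y = 0; y < 8; y++)
;       for (int x = 0; x < 8; x++)
;           dst[y*stride + x] = av_clip((a + b*(x-3) + c*(y-3) + 16) >> 5,
;                                       0, 1023);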


;-----------------------------------------------------------------------------
; void ff_pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright,
;                         int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_128_DC 0
cglobal pred8x8l_128_dc_10, 4, 4
    mova      m0, [pw_512]        ; (1<<(BIT_DEPTH-1))
    lea       r1, [r3*3]
    lea       r2, [r0+r3*4]
    MOV8      r0+r3*0, m0, m0
    MOV8      r0+r3*1, m0, m0
    MOV8      r0+r3*2, m0, m0
    MOV8      r0+r1*1, m0, m0
    MOV8      r2+r3*0, m0, m0
    MOV8      r2+r3*1, m0, m0
    MOV8      r2+r3*2, m0, m0
    MOV8      r2+r1*1, m0, m0
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_128_DC
INIT_XMM sse2
PRED8x8L_128_DC

;-----------------------------------------------------------------------------
; void ff_pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright,
;                         int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_10, 4, 4, 6
    sub       r0, r3
    mova      m0, [r0]
    shr       r1d, 14
    shr       r2d, 13
    neg       r1
    pslldq    m1, m0, 2
    psrldq    m2, m0, 2
    pinsrw    m1, [r0+r1], 0
    pinsrw    m2, [r0+r2+14], 7
    lea       r1, [r3*3]
    lea       r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    HADDW     m0, m1
    paddw     m0, [pw_4]
    psrlw     m0, 3
    SPLATW    m0, m0, 0
    mova      [r0+r3*1], m0
    mova      [r0+r3*2], m0
    mova      [r0+r1*1], m0
    mova      [r0+r3*4], m0
    mova      [r2+r3*1], m0
    mova      [r2+r3*2], m0
    mova      [r2+r1*1], m0
    mova      [r2+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_TOP_DC
INIT_XMM avx
PRED8x8L_TOP_DC

;-------------------------------------------------------------------------------
; void ff_pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-------------------------------------------------------------------------------
; TODO: see if scalar is faster
%macro PRED8x8L_DC 0
cglobal pred8x8l_dc_10, 4, 6, 6
    sub       r0, r3
    lea       r4, [r0+r3*4]
    lea       r5, [r3*3]
    mova      m0, [r0+r3*2-16]
    punpckhwd m0, [r0+r3*1-16]
    mova      m1, [r4+r3*0-16]
    punpckhwd m1, [r0+r5*1-16]
    punpckhdq m1, m0
    mova      m2, [r4+r3*2-16]
    punpckhwd m2, [r4+r3*1-16]
    mova      m3, [r4+r3*4-16]
    punpckhwd m3, [r4+r5*1-16]
    punpckhdq m3, m2
    punpckhqdq m3, m1
    mova      m0, [r0]
    shr       r1d, 14
    shr       r2d, 13
    neg       r1
    pslldq    m1, m0, 2
    psrldq    m2, m0, 2
    pinsrw    m1, [r0+r1], 0
    pinsrw    m2, [r0+r2+14], 7
    not       r1
    and       r1, r3
    pslldq    m4, m3, 2
    psrldq    m5, m3, 2
    pshuflw   m4, m4, 11100101b
    pinsrw    m5, [r0+r1-2], 7
    PRED4x4_LOWPASS m3, m4, m5, m3
    PRED4x4_LOWPASS m0, m2, m1, m0
    paddw     m0, m3
    HADDW     m0, m1
    paddw     m0, [pw_8]
    psrlw     m0, 4
    SPLATW    m0, m0
    mova      [r0+r3*1], m0
    mova      [r0+r3*2], m0
    mova      [r0+r5*1], m0
    mova      [r0+r3*4], m0
    mova      [r4+r3*1], m0
    mova      [r4+r3*2], m0
    mova      [r4+r5*1], m0
    mova      [r4+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_DC
INIT_XMM avx
PRED8x8L_DC

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical(pixel *src, int has_topleft, int has_topright,
;                           int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_10, 4, 4, 6
    sub       r0, r3
    mova      m0, [r0]
    shr       r1d, 14
    shr       r2d, 13
    neg       r1
    pslldq    m1, m0, 2
    psrldq    m2, m0, 2
    pinsrw    m1, [r0+r1], 0
    pinsrw    m2, [r0+r2+14], 7
    lea       r1, [r3*3]
    lea       r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    mova      [r0+r3*1], m0
    mova      [r0+r3*2], m0
    mova      [r0+r1*1], m0
    mova      [r0+r3*4], m0
    mova      [r2+r3*1], m0
    mova      [r2+r3*2], m0
    mova      [r2+r1*1], m0
    mova      [r2+r3*4], m0
    RET
%endmacro
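; The 8x8 luma modes filter the edge before predicting. The shr/neg trick
; above (shared with the other pred8x8l functions) turns the has_topleft and
; has_topright flag bits into byte offsets, so the pinsrw loads fetch the
; real neighbour when present and duplicate the edge sample when not. A C
; sketch of the resulting logic for this mode (names illustrative):
;
;   int tl = has_topleft  ? top[-1] : top[0];
;   int tr = has_topright ? top[8]  : top[7];
;   uint16_t t[8];
;   for (int i = 0; i < 8; i++) {
;       int l = i     ? top[i - 1] : tl;
;       int r = i < 7 ? top[i + 1] : tr;
;       t[i] = (l + 2 * top[i] + r + 2) >> 2;
;   }
;   for (int y = 0; y < 8; y++)
;       memcpy(dst + y * stride, t, sizeof(t));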

INIT_XMM sse2
PRED8x8L_VERTICAL
INIT_XMM avx
PRED8x8L_VERTICAL

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright,
;                             int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_10, 4, 4, 5
    mova      m0, [r0-16]
    shr       r1d, 14
    dec       r1
    and       r1, r3
    sub       r1, r3
    punpckhwd m0, [r0+r1-16]
    mova      m1, [r0+r3*2-16]
    punpckhwd m1, [r0+r3*1-16]
    lea       r2, [r0+r3*4]
    lea       r1, [r3*3]
    punpckhdq m1, m0
    mova      m2, [r2+r3*0-16]
    punpckhwd m2, [r0+r1-16]
    mova      m3, [r2+r3*2-16]
    punpckhwd m3, [r2+r3*1-16]
    punpckhdq m3, m2
    punpckhqdq m3, m1
    PALIGNR   m4, m3, [r2+r1-16], 14, m0
    pslldq    m0, m4, 2
    pshuflw   m0, m0, 11100101b
    PRED4x4_LOWPASS m4, m3, m0, m4
    punpckhwd m3, m4, m4
    punpcklwd m4, m4
    pshufd    m0, m3, 0xff
    pshufd    m1, m3, 0xaa
    pshufd    m2, m3, 0x55
    pshufd    m3, m3, 0x00
    mova      [r0+r3*0], m0
    mova      [r0+r3*1], m1
    mova      [r0+r3*2], m2
    mova      [r0+r1*1], m3
    pshufd    m0, m4, 0xff
    pshufd    m1, m4, 0xaa
    pshufd    m2, m4, 0x55
    pshufd    m3, m4, 0x00
    mova      [r2+r3*0], m0
    mova      [r2+r3*1], m1
    mova      [r2+r3*2], m2
    mova      [r2+r1*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL
INIT_XMM ssse3
PRED8x8L_HORIZONTAL
INIT_XMM avx
PRED8x8L_HORIZONTAL

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_left(pixel *src, int has_topleft, int has_topright,
;                            int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_10, 4, 4, 7
    sub       r0, r3
    mova      m3, [r0]
    shr       r1d, 14
    neg       r1
    shr       r2d, 13
    pslldq    m1, m3, 2
    psrldq    m2, m3, 2
    pinsrw    m1, [r0+r1], 0
    pinsrw    m2, [r0+r2+14], 7
    PRED4x4_LOWPASS m6, m2, m1, m3
    jz .fix_tr                    ; flags from shr r2d
    mova      m1, [r0+16]
    psrldq    m5, m1, 2
    PALIGNR   m2, m1, m3, 14, m3
    pshufhw   m5, m5, 10100100b
    PRED4x4_LOWPASS m1, m2, m5, m1
.do_topright:
    lea       r1, [r3*3]
    psrldq    m5, m1, 14
    lea       r2, [r0+r3*4]
    PALIGNR   m2, m1, m6, 2, m0
    PALIGNR   m3, m1, m6, 14, m0
    PALIGNR   m5, m1, 2, m0
    pslldq    m4, m6, 2
    PRED4x4_LOWPASS m6, m4, m2, m6
    PRED4x4_LOWPASS m1, m3, m5, m1
    mova      [r2+r3*4], m1
    PALIGNR   m1, m6, 14, m2
    pslldq    m6, 2
    mova      [r2+r1*1], m1
    PALIGNR   m1, m6, 14, m2
    pslldq    m6, 2
    mova      [r2+r3*2], m1
    PALIGNR   m1, m6, 14, m2
    pslldq    m6, 2
    mova      [r2+r3*1], m1
    PALIGNR   m1, m6, 14, m2
    pslldq    m6, 2
    mova      [r0+r3*4], m1
    PALIGNR   m1, m6, 14, m2
    pslldq    m6, 2
    mova      [r0+r1*1], m1
    PALIGNR   m1, m6, 14, m2
    pslldq    m6, 2
    mova      [r0+r3*2], m1
    PALIGNR   m1, m6, 14, m6
    mova      [r0+r3*1], m1
    RET
.fix_tr:
    punpckhwd m3, m3
    pshufd    m1, m3, 0xFF
    jmp .do_topright
%endmacro

INIT_XMM sse2
PRED8x8L_DOWN_LEFT
INIT_XMM ssse3
PRED8x8L_DOWN_LEFT
INIT_XMM avx
PRED8x8L_DOWN_LEFT

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_right(pixel *src, int has_topleft, int has_topright,
;                             int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_RIGHT 0
; the standard forbids this mode when has_topleft is false,
; so there is no need to check for it
cglobal pred8x8l_down_right_10, 4, 5, 8
    sub       r0, r3
    lea       r4, [r0+r3*4]
    lea       r1, [r3*3]
    mova      m0, [r0+r3*1-16]
    punpckhwd m0, [r0+r3*0-16]
    mova      m1, [r0+r1*1-16]
    punpckhwd m1, [r0+r3*2-16]
    punpckhdq m1, m0
    mova      m2, [r4+r3*1-16]
    punpckhwd m2, [r4+r3*0-16]
    mova      m3, [r4+r1*1-16]
    punpckhwd m3, [r4+r3*2-16]
    punpckhdq m3, m2
    punpckhqdq m3, m1
    mova      m0, [r4+r3*4-16]
    mova      m1, [r0]
    PALIGNR   m4, m3, m0, 14, m0
    PALIGNR   m1, m3, 2, m2
    pslldq    m0, m4, 2
    pshuflw   m0, m0, 11100101b
    PRED4x4_LOWPASS m6, m1, m4, m3
    PRED4x4_LOWPASS m4, m3, m0, m4
    mova      m3, [r0]
    shr       r2d, 13
    pslldq    m1, m3, 2
    psrldq    m2, m3, 2
    pinsrw    m1, [r0-2], 0
    pinsrw    m2, [r0+r2+14], 7
    PRED4x4_LOWPASS m3, m2, m1, m3
    PALIGNR   m2, m3, m6, 2, m0
    PALIGNR   m5, m3, m6, 14, m0
    psrldq    m7, m3, 2
    PRED4x4_LOWPASS m6, m4, m2, m6
    PRED4x4_LOWPASS m3, m5, m7, m3
    mova      [r4+r3*4], m6
    PALIGNR   m3, m6, 14, m2
    pslldq    m6, 2
    mova      [r0+r3*1], m3
    PALIGNR   m3, m6, 14, m2
    pslldq    m6, 2
    mova      [r0+r3*2], m3
    PALIGNR   m3, m6, 14, m2
    pslldq    m6, 2
    mova      [r0+r1*1], m3
    PALIGNR   m3, m6, 14, m2
    pslldq    m6, 2
    mova      [r0+r3*4], m3
    PALIGNR   m3, m6, 14, m2
    pslldq    m6, 2
    mova      [r4+r3*1], m3
    PALIGNR   m3, m6, 14, m2
    pslldq    m6, 2
    mova      [r4+r3*2], m3
    PALIGNR   m3, m6, 14, m6
    mova      [r4+r1*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_DOWN_RIGHT
INIT_XMM ssse3
PRED8x8L_DOWN_RIGHT
INIT_XMM avx
PRED8x8L_DOWN_RIGHT

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_right(pixel *src, int has_topleft,
;                                 int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL_RIGHT 0
; like pred8x8l_down_right, the standard guarantees has_topleft here
cglobal pred8x8l_vertical_right_10, 4, 5, 7
    sub       r0, r3
    lea       r4, [r0+r3*4]
    lea       r1, [r3*3]
    mova      m0, [r0+r3*1-16]
    punpckhwd m0, [r0+r3*0-16]
    mova      m1, [r0+r1*1-16]
    punpckhwd m1, [r0+r3*2-16]
    punpckhdq m1, m0
    mova      m2, [r4+r3*1-16]
    punpckhwd m2, [r4+r3*0-16]
    mova      m3, [r4+r1*1-16]
    punpckhwd m3, [r4+r3*2-16]
    punpckhdq m3, m2
    punpckhqdq m3, m1
    mova      m0, [r4+r3*4-16]
    mova      m1, [r0]
    PALIGNR   m4, m3, m0, 14, m0
    PALIGNR   m1, m3, 2, m2
    PRED4x4_LOWPASS m3, m1, m4, m3
    mova      m2, [r0]
    shr       r2d, 13
    pslldq    m1, m2, 2
    psrldq    m5, m2, 2
    pinsrw    m1, [r0-2], 0
    pinsrw    m5, [r0+r2+14], 7
    PRED4x4_LOWPASS m2, m5, m1, m2
    PALIGNR   m6, m2, m3, 12, m1
    PALIGNR   m5, m2, m3, 14, m0
    PRED4x4_LOWPASS m0, m6, m2, m5
    pavgw     m2, m5
    mova      [r0+r3*2], m0
    mova      [r0+r3*1], m2
    pslldq    m6, m3, 4
    pslldq    m1, m3, 2
    PRED4x4_LOWPASS m1, m3, m6, m1
    PALIGNR   m2, m1, 14, m4
    mova      [r0+r1*1], m2
    pslldq    m1, 2
    PALIGNR   m0, m1, 14, m3
    mova      [r0+r3*4], m0
    pslldq    m1, 2
    PALIGNR   m2, m1, 14, m4
    mova      [r4+r3*1], m2
    pslldq    m1, 2
    PALIGNR   m0, m1, 14, m3
    mova      [r4+r3*2], m0
    pslldq    m1, 2
    PALIGNR   m2, m1, 14, m4
    mova      [r4+r1*1], m2
    pslldq    m1, 2
    PALIGNR   m0, m1, 14, m1
    mova      [r4+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL_RIGHT
INIT_XMM ssse3
PRED8x8L_VERTICAL_RIGHT
INIT_XMM avx
PRED8x8L_VERTICAL_RIGHT

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_up(pixel *src, int has_topleft,
;                                int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_10, 4, 4, 6
    mova      m0, [r0+r3*0-16]
    punpckhwd m0, [r0+r3*1-16]
    shr       r1d, 14
    dec       r1
    and       r1, r3
    sub       r1, r3
    mova      m4, [r0+r1*1-16]
    lea       r1, [r3*3]
    lea       r2, [r0+r3*4]
    mova      m1, [r0+r3*2-16]
    punpckhwd m1, [r0+r1*1-16]
    punpckhdq m0, m1
    mova      m2, [r2+r3*0-16]
    punpckhwd m2, [r2+r3*1-16]
    mova      m3, [r2+r3*2-16]
    punpckhwd m3, [r2+r1*1-16]
    punpckhdq m2, m3
    punpckhqdq m0, m2
    PALIGNR   m1, m0, m4, 14, m4
    psrldq    m2, m0, 2
    pshufhw   m2, m2, 10100100b
    PRED4x4_LOWPASS m0, m1, m2, m0
    psrldq    m1, m0, 2
    psrldq    m2, m0, 4
    pshufhw   m1, m1, 10100100b
    pshufhw   m2, m2, 01010100b
    pavgw     m4, m0, m1
    PRED4x4_LOWPASS m1, m2, m0, m1
    punpckhwd m5, m4, m1
    punpcklwd m4, m1
    mova      [r2+r3*0], m5
    mova      [r0+r3*0], m4
    pshufd    m0, m5, 11111001b
    pshufd    m1, m5, 11111110b
    pshufd    m2, m5, 11111111b
    mova      [r2+r3*1], m0
    mova      [r2+r3*2], m1
    mova      [r2+r1*1], m2
    PALIGNR   m2, m5, m4, 4, m0
    PALIGNR   m3, m5, m4, 8, m1
    PALIGNR   m5, m5, m4, 12, m4
    mova      [r0+r3*1], m2
    mova      [r0+r3*2], m3
    mova      [r0+r1*1], m5
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL_UP
INIT_XMM ssse3
PRED8x8L_HORIZONTAL_UP
INIT_XMM avx
PRED8x8L_HORIZONTAL_UP


;-----------------------------------------------------------------------------
; void ff_pred16x16_vertical(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro MOV16 3-5
    mova      [%1+ 0], %2
    mova      [%1+mmsize], %3
%if mmsize==8
    mova      [%1+ 16], %4
    mova      [%1+ 24], %5
%endif
%endmacro

%macro PRED16x16_VERTICAL 0
cglobal pred16x16_vertical_10, 2, 3
    sub       r0, r1
    mov       r2d, 8
    mova      m0, [r0+ 0]
    mova      m1, [r0+mmsize]
%if mmsize==8
    mova      m2, [r0+16]
    mova      m3, [r0+24]
%endif
.loop:
    MOV16     r0+r1*1, m0, m1, m2, m3
    MOV16     r0+r1*2, m0, m1, m2, m3
    lea       r0, [r0+r1*2]
    dec       r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_VERTICAL
INIT_XMM sse2
PRED16x16_VERTICAL

;-----------------------------------------------------------------------------
; void ff_pred16x16_horizontal(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_HORIZONTAL 0
cglobal pred16x16_horizontal_10, 2, 3
    mov       r2d, 8
.vloop:
    movd      m0, [r0+r1*0-4]
    movd      m1, [r0+r1*1-4]
    SPLATW    m0, m0, 1
    SPLATW    m1, m1, 1
    MOV16     r0+r1*0, m0, m0, m0, m0
    MOV16     r0+r1*1, m1, m1, m1, m1
    lea       r0, [r0+r1*2]
    dec       r2d
    jg .vloop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_HORIZONTAL
INIT_XMM sse2
PRED16x16_HORIZONTAL

;-----------------------------------------------------------------------------
; void ff_pred16x16_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_DC 0
cglobal pred16x16_dc_10, 2, 6
    mov       r5, r0
    sub       r0, r1
    mova      m0, [r0+0]
    paddw     m0, [r0+mmsize]
%if mmsize==8
    paddw     m0, [r0+16]
    paddw     m0, [r0+24]
%endif
    HADDW     m0, m2

    lea       r0, [r0+r1-2]
    movzx     r3d, word [r0]
    movzx     r4d, word [r0+r1]
%rep 7
    lea       r0, [r0+r1*2]
    movzx     r2d, word [r0]
    add       r3d, r2d
    movzx     r2d, word [r0+r1]
    add       r4d, r2d
%endrep
    lea       r3d, [r3+r4+16]

    movd      m1, r3d
    paddw     m0, m1
    psrlw     m0, 5
    SPLATW    m0, m0
    mov       r3d, 8
.loop:
    MOV16     r5+r1*0, m0, m0, m0, m0
    MOV16     r5+r1*1, m0, m0, m0, m0
    lea       r5, [r5+r1*2]
    dec       r3d
    jg .loop
    REP_RET
%endmacro
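; The splatted value is the usual 16x16 DC term, i.e. in rough C:
;
;   dc = (sum(top[0..15]) + sum(left[0..15]) + 16) >> 5;
;
; with the top sum coming from HADDW and the left sum from the movzx loop.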

INIT_MMX mmxext
PRED16x16_DC
INIT_XMM sse2
PRED16x16_DC

;-----------------------------------------------------------------------------
; void ff_pred16x16_top_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_TOP_DC 0
cglobal pred16x16_top_dc_10, 2, 3
    sub       r0, r1
    mova      m0, [r0+0]
    paddw     m0, [r0+mmsize]
%if mmsize==8
    paddw     m0, [r0+16]
    paddw     m0, [r0+24]
%endif
    HADDW     m0, m2

    SPLATW    m0, m0
    paddw     m0, [pw_8]
    psrlw     m0, 4
    mov       r2d, 8
.loop:
    MOV16     r0+r1*1, m0, m0, m0, m0
    MOV16     r0+r1*2, m0, m0, m0, m0
    lea       r0, [r0+r1*2]
    dec       r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_TOP_DC
INIT_XMM sse2
PRED16x16_TOP_DC

;-----------------------------------------------------------------------------
; void ff_pred16x16_left_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_LEFT_DC 0
cglobal pred16x16_left_dc_10, 2, 6
    mov       r5, r0

    sub       r0, 2
    movzx     r3d, word [r0]
    movzx     r4d, word [r0+r1]
%rep 7
    lea       r0, [r0+r1*2]
    movzx     r2d, word [r0]
    add       r3d, r2d
    movzx     r2d, word [r0+r1]
    add       r4d, r2d
%endrep
    lea       r3d, [r3+r4+8]
    shr       r3d, 4

    movd      m0, r3d
    SPLATW    m0, m0
    mov       r3d, 8
.loop:
    MOV16     r5+r1*0, m0, m0, m0, m0
    MOV16     r5+r1*1, m0, m0, m0, m0
    lea       r5, [r5+r1*2]
    dec       r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_LEFT_DC
INIT_XMM sse2
PRED16x16_LEFT_DC

;-----------------------------------------------------------------------------
; void ff_pred16x16_128_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_128_DC 0
cglobal pred16x16_128_dc_10, 2, 3
    mova      m0, [pw_512]
    mov       r2d, 8
.loop:
    MOV16     r0+r1*0, m0, m0, m0, m0
    MOV16     r0+r1*1, m0, m0, m0, m0
    lea       r0, [r0+r1*2]
    dec       r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_128_DC
INIT_XMM sse2
PRED16x16_128_DC