h264/x86: sign extend int stride in deblock functions
libavcodec/x86/h264_deblock_10bit.asm
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_pixel_max: times 8 dw ((1 << 10)-1)

SECTION .text

cextern pw_2
cextern pw_3
cextern pw_4

; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
    psubusw %5, %2, %1
    psubusw %4, %1, %2
    por     %4, %5
    psubw   %4, %3
%endmacro

; out: %4 = |%1-%2|<%3
%macro DIFF_LT 5
    psubusw %4, %2, %1
    psubusw %5, %1, %2
    por     %5, %4 ; |%1-%2|
    pxor    %4, %4
    psubw   %5, %3 ; |%1-%2|-%3
    pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
%endmacro
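
; ABS_SUB and DIFF_LT build |a-b| from two unsigned saturating subtracts so no
; SSSE3 pabsw is needed.  A scalar sketch of the trick (illustrative C only;
; sat_sub_u16() is a stand-in for psubusw, not part of the build):
;   diff = sat_sub_u16(a, b) | sat_sub_u16(b, a);  // == |a-b| for uint16
;   lt   = (int16_t)(diff - c) < 0;                // == |a-b| < c
; The signed compare is safe here because 10-bit samples keep diff-c in range.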

%macro LOAD_AB 4
    movd    %1, %3
    movd    %2, %4
    SPLATW  %1, %1
    SPLATW  %2, %2
%endmacro

; in: %2=tc reg
; out: %1=splatted tc
%macro LOAD_TC 2
    movd        %1, [%2]
    punpcklbw   %1, %1
%if mmsize == 8
    pshufw      %1, %1, 0
%else
    pshuflw     %1, %1, 01010000b
    pshufd      %1, %1, 01010000b
%endif
    psraw       %1, 6
%endmacro
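
; Each int8 tc0 value covers four luma pixels.  The unpack+shuffle leaves every
; word holding (tc0 << 8) | tc0, so the arithmetic shift by 6 both finishes the
; splat and rescales tc0 to the 10-bit range (tc0 << 2 for tc0 >= 0), while a
; tc0 of -1 (filtering disabled) stays negative.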

; in: %1=p1, %2=p0, %3=q0, %4=q1
;     %5=alpha, %6=beta, %7-%9=tmp
; out: %7=mask
%macro LOAD_MASK 9
    ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha
    ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta
    pand    %8, %9
    ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta
    pxor    %7, %7
    pand    %8, %9
    pcmpgtw %7, %8
%endmacro
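
; ANDing the three biased differences keeps the sign bit set only where all of
; them are negative, so the resulting word mask is all-ones exactly where the
; spec's filtering condition holds:
;   |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta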

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro DEBLOCK_P0_Q0 7
    psubw   %3, %4
    pxor    %7, %7
    paddw   %3, [pw_4]
    psubw   %7, %5
    psubw   %6, %2, %1
    psllw   %6, 2
    paddw   %3, %6
    psraw   %3, 3
    mova    %6, [pw_pixel_max]
    CLIPW   %3, %7, %5
    pxor    %7, %7
    paddw   %1, %3
    psubw   %2, %3
    CLIPW   %1, %7, %6
    CLIPW   %2, %7, %6
%endmacro
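
; Scalar equivalent of this normal-strength p0/q0 update (pix_max is 1023 in
; this file; %5 already holds tc masked to the filtered positions):
;   d  = av_clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
;   p0 = av_clip(p0 + d, 0, pix_max);
;   q0 = av_clip(q0 - d, 0, pix_max);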

; in: %1=x2, %2=x1, %3=p0, %4=q0, %5=mask&tc, %6=tmp
%macro LUMA_Q1 6
    pavgw   %6, %3, %4      ; (p0+q0+1)>>1
    paddw   %1, %6
    pxor    %6, %6
    psraw   %1, 1
    psubw   %6, %5
    psubw   %1, %2
    CLIPW   %1, %6, %5
    paddw   %1, %2
%endmacro
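
; Scalar equivalent (shown for the p side; the q side is symmetric):
;   p1 += av_clip((p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1, -tc0, tc0);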

%macro LUMA_DEBLOCK_ONE 3
    DIFF_LT m5, %1, bm, m4, m6
    pxor    m6, m6
    mova    %3, m4
    pcmpgtw m6, tcm
    pand    m4, tcm
    pandn   m6, m7
    pand    m4, m6
    LUMA_Q1 m5, %2, m1, m2, m4, m6
%endmacro
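
; %3 receives the raw |p2-p0| < beta (resp. |q2-q0| < beta) mask, all-ones per
; word where true; subtracting these masks from tc0 in the callers below is
; what implements the spec's tc = tc0 + (ap < beta) + (aq < beta).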

%macro LUMA_H_STORE 2
%if mmsize == 8
    movq    [r0-4], m0
    movq    [r0+r1-4], m1
    movq    [r0+r1*2-4], m2
    movq    [r0+%2-4], m3
%else
    movq    [r0-4], m0
    movhps  [r0+r1-4], m0
    movq    [r0+r1*2-4], m1
    movhps  [%1-4], m1
    movq    [%1+r1-4], m2
    movhps  [%1+r1*2-4], m2
    movq    [%1+%2-4], m3
    movhps  [%1+r1*4-4], m3
%endif
%endmacro

%macro DEBLOCK_LUMA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_10(uint16_t *pix, int stride, int alpha, int beta,
;                           int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
    %assign pad 5*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define am  [rsp+mmsize*3]
    %define bm  [rsp+mmsize*4]
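    ; the stride parameter is declared int: sign-extend it to 64 bits before it
    ; enters address arithmetic (movsxdifnidn expands to nothing on x86-32)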
    movsxdifnidn r1, r1d
    SUB     rsp, pad
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB m4, m5, r2d, r3d
    mov     r3, 32/mmsize
    mov     r2, r0
    sub     r0, r1
    mova    am, m4
    sub     r0, r1
    mova    bm, m5
    sub     r0, r1
.loop:
    mova    m0, [r0+r1]
    mova    m1, [r0+r1*2]
    mova    m2, [r2]
    mova    m3, [r2+r1]

    LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC m6, r4
    mova    tcm, m6

    mova    m5, [r0]
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova    [r0+r1], m5

    mova    m5, [r2+r1*2]
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova    [r2+r1], m5

    pxor    m5, m5
    mova    m6, tcm
    pcmpgtw m5, tcm
    psubw   m6, ms1
    pandn   m5, m7
    psubw   m6, ms2
    pand    m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova    [r0+r1*2], m1
    mova    [r2], m2

    add     r0, mmsize
    add     r2, mmsize
    add     r4, mmsize/8
    dec     r3
    jg .loop
    ADD     rsp, pad
    RET

cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
    %assign pad 7*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define p1m [rsp+mmsize*3]
    %define p2m [rsp+mmsize*4]
    %define am  [rsp+mmsize*5]
    %define bm  [rsp+mmsize*6]
    movsxdifnidn r1, r1d
    SUB     rsp, pad
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB m4, m5, r2d, r3d
    mov     r3, r1
    mova    am, m4
    add     r3, r1
    mov     r5, 32/mmsize
    mova    bm, m5
    add     r3, r1
%if mmsize == 16
    mov     r2, r0
    add     r2, r3
%endif
.loop:
%if mmsize == 8
    movq    m2, [r0-8]      ; y q2 q1 q0
    movq    m7, [r0+0]
    movq    m5, [r0+r1-8]
    movq    m3, [r0+r1+0]
    movq    m0, [r0+r1*2-8]
    movq    m6, [r0+r1*2+0]
    movq    m1, [r0+r3-8]
    TRANSPOSE4x4W 2, 5, 0, 1, 4
    SWAP    2, 7
    movq    m7, [r0+r3]
    TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
    movu    m5, [r0-8]      ; y q2 q1 q0 p0 p1 p2 x
    movu    m0, [r0+r1-8]
    movu    m2, [r0+r1*2-8]
    movu    m3, [r2-8]
    TRANSPOSE4x4W 5, 0, 2, 3, 6
    mova    tcm, m3

    movu    m4, [r2+r1-8]
    movu    m1, [r2+r1*2-8]
    movu    m3, [r2+r3-8]
    movu    m7, [r2+r1*4-8]
    TRANSPOSE4x4W 4, 1, 3, 7, 6

    mova    m6, tcm
    punpcklqdq m6, m7
    punpckhqdq m5, m4
    SBUTTERFLY qdq, 0, 1, 7
    SBUTTERFLY qdq, 2, 3, 7
%endif
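    ; the horizontal filter loads 8 pixels across the edge per row and
    ; transposes them so the vertical-filter math can be reused; the results
    ; are transposed back on store by LUMA_H_STORE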

    mova    p2m, m6
    LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC m6, r4
    mova    tcm, m6

    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova    p1m, m5

    mova    m5, p2m
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova    p2m, m5

    pxor    m5, m5
    mova    m6, tcm
    pcmpgtw m5, tcm
    psubw   m6, ms1
    pandn   m5, m7
    psubw   m6, ms2
    pand    m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova    m0, p1m
    mova    m3, p2m
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r2, r3

    add     r4, mmsize/8
    lea     r0, [r0+r1*(mmsize/2)]
    lea     r2, [r2+r1*(mmsize/2)]
    dec     r5
    jg .loop
    ADD     rsp, pad
    RET
%endmacro

%if ARCH_X86_64
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
;     m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; clobbers: m4, m5, m6, m7, m10, m11, m14
%macro DEBLOCK_LUMA_INTER_SSE2 0
    LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6
    LOAD_TC m6, r4
    DIFF_LT m8, m1, m13, m10, m4
    DIFF_LT m9, m2, m13, m11, m4
    pand    m6, m7

    mova    m14, m6
    pxor    m4, m4
    pcmpgtw m6, m4
    pand    m6, m14

    mova    m5, m10
    pand    m5, m6
    LUMA_Q1 m8, m0, m1, m2, m5, m4

    mova    m5, m11
    pand    m5, m6
    LUMA_Q1 m9, m3, m1, m2, m5, m4

    pxor    m4, m4
    psubw   m6, m10
    pcmpgtw m4, m14
    pandn   m4, m7
    psubw   m6, m11
    pand    m4, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6

    SWAP    0, 8
    SWAP    3, 9
%endmacro

%macro DEBLOCK_LUMA_64 0
cglobal deblock_v_luma_10, 5,5,15
    %define p2 m8
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define q2 m9
    %define mask0 m7
    %define mask1 m10
    %define mask2 m11
    movsxdifnidn r1, r1d
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB m12, m13, r2d, r3d
    mov     r2, r0
    sub     r0, r1
    sub     r0, r1
    sub     r0, r1
    mov     r3, 2
.loop:
    mova    p2, [r0]
    mova    p1, [r0+r1]
    mova    p0, [r0+r1*2]
    mova    q0, [r2]
    mova    q1, [r2+r1]
    mova    q2, [r2+r1*2]
    DEBLOCK_LUMA_INTER_SSE2
    mova    [r0+r1], p1
    mova    [r0+r1*2], p0
    mova    [r2], q0
    mova    [r2+r1], q1
    add     r0, mmsize
    add     r2, mmsize
    add     r4, 2
    dec     r3
    jg .loop
    REP_RET

cglobal deblock_h_luma_10, 5,7,15
    movsxdifnidn r1, r1d
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB m12, m13, r2d, r3d
    mov     r2, r1
    add     r2, r1
    add     r2, r1
    mov     r5, r0
    add     r5, r2
    mov     r6, 2
.loop:
    movu    m8, [r0-8]      ; y q2 q1 q0 p0 p1 p2 x
    movu    m0, [r0+r1-8]
    movu    m2, [r0+r1*2-8]
    movu    m9, [r5-8]
    movu    m5, [r5+r1-8]
    movu    m1, [r5+r1*2-8]
    movu    m3, [r5+r2-8]
    movu    m7, [r5+r1*4-8]

    TRANSPOSE4x4W 8, 0, 2, 9, 10
    TRANSPOSE4x4W 5, 1, 3, 7, 10

    punpckhqdq m8, m5
    SBUTTERFLY qdq, 0, 1, 10
    SBUTTERFLY qdq, 2, 3, 10
    punpcklqdq m9, m7

    DEBLOCK_LUMA_INTER_SSE2

    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r5, r2
    add     r4, 2
    lea     r0, [r0+r1*8]
    lea     r5, [r5+r1*8]
    dec     r6
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_64
INIT_XMM avx
DEBLOCK_LUMA_64
%endif

%macro SWAPMOVA 2
%ifid %1
    SWAP %1, %2
%else
    mova %1, %2
%endif
%endmacro

; in: t0-t2: tmp registers
;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
%if ARCH_X86_64
    paddw   t0, %3, %2
    mova    t2, %4
    paddw   t2, %3
%else
    mova    t0, %3
    mova    t2, %4
    paddw   t0, %2
    paddw   t2, %3
%endif
    paddw   t0, %1
    paddw   t2, t2
    paddw   t0, %5
    paddw   t2, %9
    paddw   t0, %9          ; (p2 + p1 + p0 + q0 + 2)
    paddw   t2, t0          ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)

    psrlw   t2, 3
    psrlw   t1, t0, 2
    psubw   t2, %3
    psubw   t1, %2
    pand    t2, %8
    pand    t1, %8
    paddw   t2, %3
    paddw   t1, %2
    SWAPMOVA %11, t1

    psubw   t1, t0, %3
    paddw   t0, t0
    psubw   t1, %5
    psubw   t0, %3
    paddw   t1, %6
    paddw   t1, %2
    paddw   t0, %6
    psrlw   t1, 2           ; (2*p1 + p0 + q1 + 2)/4
    psrlw   t0, 3           ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3

    pxor    t0, t1
    pxor    t1, %1
    pand    t0, %8
    pand    t1, %7
    pxor    t0, t1
    pxor    t0, %1
    SWAPMOVA %10, t0
    SWAPMOVA %12, t2
%endmacro
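
; Scalar equivalents (strong intra filter, p side shown; the q side mirrors
; it).  mask1p adds |p2-p0| < beta && |p0-q0| < alpha/4+2 on top of mask0:
;   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3   if mask1p
;   p0' = (2*p1 + p0 + q1 + 2) >> 2                 else, if mask0
;   p1' = (p2 + p1 + p0 + q0 + 2) >> 2              if mask1p
;   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3     if mask1p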

%macro LUMA_INTRA_INIT 1
    %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
    %assign i 4
%rep %1
    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
    %assign i i+1
%endrep
    movsxdifnidn r1, r1d
    SUB     rsp, pad
%endmacro

; in: %1-%3=tmp, %4=p2, %5=q2
%macro LUMA_INTRA_INTER 5
    LOAD_AB t0, t1, r2d, r3d
    mova    %1, t0
    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
%if ARCH_X86_64
    mova    %2, t0          ; mask0
    psrlw   t3, %1, 2
%else
    mova    t3, %1
    mova    %2, t0          ; mask0
    psrlw   t3, 2
%endif
    paddw   t3, [pw_2]      ; alpha/4+2
    DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
    pand    t2, %2
    mova    t3, %5          ; q2
    mova    %1, t2          ; mask1
    DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
    pand    t2, %1
    mova    t3, %4          ; p2
    mova    %3, t2          ; mask1q
    DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
    pand    t2, %1
    mova    %1, t2          ; mask1p
%endmacro

%macro LUMA_H_INTRA_LOAD 0
%if mmsize == 8
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r4-8]
    TRANSPOSE4x4W 4, 5, 0, 1, 2
    mova    t4, t0          ; p3
    mova    t5, t1          ; p2

    movu    m2, [r0]
    movu    m3, [r0+r1]
    movu    t0, [r0+r1*2]
    movu    t1, [r0+r4]
    TRANSPOSE4x4W 2, 3, 4, 5, 6
    mova    t6, t0          ; q2
    mova    t7, t1          ; q3
%else
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r5-8]
    movu    m2, [r4-8]
    movu    m3, [r4+r1-8]
    movu    t2, [r4+r1*2-8]
    movu    t3, [r4+r5-8]
    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
    mova    t4, t0          ; p3
    mova    t5, t1          ; p2
    mova    t6, t2          ; q2
    mova    t7, t3          ; q3
%endif
%endmacro

; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
%macro LUMA_H_INTRA_STORE 9
%if mmsize == 8
    TRANSPOSE4x4W %1, %2, %3, %4, %9
    movq    [r0-8], m%1
    movq    [r0+r1-8], m%2
    movq    [r0+r1*2-8], m%3
    movq    [r0+r4-8], m%4
    movq    m%1, %8
    TRANSPOSE4x4W %5, %6, %7, %1, %9
    movq    [r0], m%5
    movq    [r0+r1], m%6
    movq    [r0+r1*2], m%7
    movq    [r0+r4], m%1
%else
    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
    movq    [r0-8], m%1
    movq    [r0+r1-8], m%2
    movq    [r0+r1*2-8], m%3
    movq    [r0+r5-8], m%4
    movhps  [r4-8], m%1
    movhps  [r4+r1-8], m%2
    movhps  [r4+r1*2-8], m%3
    movhps  [r4+r5-8], m%4
%ifnum %8
    SWAP    %1, %8
%else
    mova    m%1, %8
%endif
    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
    movq    [r0], m%5
    movq    [r0+r1], m%6
    movq    [r0+r1*2], m%7
    movq    [r0+r5], m%1
    movhps  [r4], m%5
    movhps  [r4+r1], m%6
    movhps  [r4+r1*2], m%7
    movhps  [r4+r5], m%1
%endif
%endmacro

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 0
cglobal deblock_v_luma_intra_10, 4,7,16
    %define t0 m1
    %define t1 m2
    %define t2 m4
    %define p2 m8
    %define p1 m9
    %define p0 m10
    %define q0 m11
    %define q1 m12
    %define q2 m13
    %define aa m5
    %define bb m14
    movsxdifnidn r1, r1d
    lea     r4, [r1*4]
    lea     r5, [r1*3]      ; 3*stride
    neg     r4
    add     r4, r0          ; pix-4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl     r2d, 2
    shl     r3d, 2
    LOAD_AB aa, bb, r2d, r3d
.loop:
    mova    p2, [r4+r1]
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
    mova    q2, [r0+2*r1]

    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
    mova    t2, aa
    psrlw   t2, 2
    paddw   t2, m0          ; alpha/4+2
    DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
    DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1
    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,16
    %define t0 m15
    %define t1 m14
    %define t2 m2
    %define q3 m5
    %define q2 m8
    %define q1 m9
    %define q0 m10
    %define p0 m11
    %define p1 m12
    %define p2 m13
    %define p3 m4
    %define spill [rsp]
    %assign pad 24-(stack_offset&15)
    movsxdifnidn r1, r1d
    SUB     rsp, pad
    lea     r4, [r1*4]
    lea     r5, [r1*3]      ; 3*stride
    add     r4, r0          ; pix+4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl     r2d, 2
    shl     r3d, 2
.loop:
    movu    q3, [r0-8]
    movu    q2, [r0+r1-8]
    movu    q1, [r0+r1*2-8]
    movu    q0, [r0+r5-8]
    movu    p0, [r4-8]
    movu    p1, [r4+r1-8]
    movu    p2, [r4+r1*2-8]
    movu    p3, [r4+r5-8]
    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1

    LOAD_AB m1, m2, r2d, r3d
    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
    psrlw   m1, 2
    paddw   m1, m0          ; alpha/4+2
    DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
    DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1

    mova    spill, q3
    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
    mova    m7, spill

    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14

    lea     r0, [r0+r1*8]
    lea     r4, [r4+r1*8]
    dec     r6
    jg .loop
    ADD     rsp, pad
    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_INTRA_64
INIT_XMM avx
DEBLOCK_LUMA_INTRA_64

%endif

%macro DEBLOCK_LUMA_INTRA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 3
    lea     r4, [r1*4]
    lea     r5, [r1*3]
    neg     r4
    add     r4, r0
    mov     r6, 32/mmsize
    shl     r2d, 2
    shl     r3d, 2
.loop:
    mova    m0, [r4+r1*2]   ; p1
    mova    m1, [r4+r5]     ; p0
    mova    m2, [r0]        ; q0
    mova    m3, [r0+r1]     ; q1
    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
    mova    t3, [r0+r1*2]   ; q2
    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    ADD     rsp, pad
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 8
%if mmsize == 8
    lea     r4, [r1*3]
    mov     r5, 32/mmsize
%else
    lea     r4, [r1*4]
    lea     r5, [r1*3]      ; 3*stride
    add     r4, r0          ; pix+4*stride
    mov     r6, 32/mmsize
%endif
    shl     r2d, 2
    shl     r3d, 2
.loop:
    LUMA_H_INTRA_LOAD
    LUMA_INTRA_INTER t8, t9, t10, t5, t6

    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
    mova    t3, t6          ; q2
    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5

    mova    m2, t4
    mova    m0, t11
    mova    m1, t5
    mova    m3, t8
    mova    m6, t6

    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7

    lea     r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
    dec     r5
%else
    lea     r4, [r4+r1*(mmsize/2)]
    dec     r6
%endif
    jg .loop
    ADD     rsp, pad
    RET
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
INIT_XMM avx
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
    mova    %6, [pw_2]
    paddw   %6, %3
    paddw   %6, %4
    paddw   %7, %6, %2
    paddw   %6, %1
    paddw   %6, %3
    paddw   %7, %4
    psraw   %6, 2
    psraw   %7, 2
    psubw   %6, %1
    psubw   %7, %2
    pand    %6, %5
    pand    %7, %5
    paddw   %1, %6
    paddw   %2, %7
%endmacro
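
; Scalar equivalent of this intra chroma update:
;   p0' = (2*p1 + p0 + q1 + 2) >> 2;
;   q0' = (2*q1 + q0 + p1 + 2) >> 2;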

%macro CHROMA_V_LOAD 1
    mova    m0, [r0]        ; p1
    mova    m1, [r0+r1]     ; p0
    mova    m2, [%1]        ; q0
    mova    m3, [%1+r1]     ; q1
%endmacro

%macro CHROMA_V_STORE 0
    mova    [r0+1*r1], m1
    mova    [r0+2*r1], m2
%endmacro

%macro CHROMA_V_LOAD_TC 2
    movd        %1, [%2]
    punpcklbw   %1, %1
    punpcklwd   %1, %1
    psraw       %1, 6
%endmacro
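
; As in LOAD_TC, the final shift both finishes the splat and rescales tc0 for
; 10-bit; here each tc0 byte ends up duplicated across a pair of words, i.e.
; one tc0 value per two chroma pixels.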

%macro DEBLOCK_CHROMA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
;                             int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
    movsxdifnidn r1, r1d
    mov     r5, r0
    sub     r0, r1
    sub     r0, r1
    shl     r2d, 2
    shl     r3d, 2
%if mmsize < 16
    mov     r6, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r5
    LOAD_AB m4, m5, r2d, r3d
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    pxor    m4, m4
    CHROMA_V_LOAD_TC m6, r4
    psubw   m6, [pw_3]
    pmaxsw  m6, m4
    pand    m7, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add     r0, mmsize
    add     r5, mmsize
    add     r4, mmsize/4
    dec     r6
    jg .loop
    REP_RET
%else
    RET
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra_10(uint16_t *pix, int stride, int alpha,
;                                   int beta)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
    movsxdifnidn r1, r1d
    mov     r4, r0
    sub     r0, r1
    sub     r0, r1
    shl     r2d, 2
    shl     r3d, 2
%if mmsize < 16
    mov     r5, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r4
    LOAD_AB m4, m5, r2d, r3d
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add     r0, mmsize
    add     r4, mmsize
    dec     r5
    jg .loop
    REP_RET
%else
    RET
%endif
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_CHROMA
INIT_XMM avx
DEBLOCK_CHROMA