h264/x86: sign extend int stride in deblock functions
libavcodec/x86/h264_deblock.asm
;*****************************************************************************
;* MMX/SSE2/AVX-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pb_A1:  times 16 db 0xA1
pb_3_1: times 4 db 3, 1

SECTION .text

cextern pb_0
cextern pb_1
cextern pb_3

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

%define PASS8ROWS(base, base3, stride, stride3, offset) \
    PASS8ROWS(base+offset, base3+offset, stride, stride3)

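; e.g. PASS8ROWS(r0, r0+r1*3, r1, r1*3) gives the eight row addresses
; [r0], [r0+r1], ..., [r0+r1*7] (base3 is expected to be base+3*stride)
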
; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 11
    movh      m0, %4
    movh      m2, %5
    movh      m1, %6
    movh      m3, %7
    punpckl%1 m0, m2
    punpckl%1 m1, m3
    mova      m2, m0
    punpckl%2 m0, m1
    punpckh%2 m2, m1

    movh      m4, %8
    movh      m6, %9
    movh      m5, %10
    movh      m7, %11
    punpckl%1 m4, m6
    punpckl%1 m5, m7
    mova      m6, m4
    punpckl%2 m4, m5
    punpckh%2 m6, m5

    punpckh%3 m1, m0, m4
    punpckh%3 m3, m2, m6
    punpckl%3 m0, m4
    punpckl%3 m2, m6
%endmacro

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    punpckhdq m4, m0, m0
    punpckhdq m5, m1, m1
    punpckhdq m6, m2, m2

    punpcklbw m0, m1
    punpcklbw m2, m3
    punpcklwd m1, m0, m2
    punpckhwd m0, m2
    movh      %1, m1
    punpckhdq m1, m1
    movh      %2, m1
    movh      %3, m0
    punpckhdq m0, m0
    movh      %4, m0

    punpckhdq m3, m3
    punpcklbw m4, m5
    punpcklbw m6, m3
    punpcklwd m5, m4, m6
    punpckhwd m4, m6
    movh      %5, m5
    punpckhdq m5, m5
    movh      %6, m5
    movh      %7, m4
    punpckhdq m4, m4
    movh      %8, m4
%endmacro

%macro TRANSPOSE4x8B_LOAD 8
    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
%endmacro

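; like x86util's SBUTTERFLY, but with explicit operands instead of
; register numbers; out: %2 = punpckl%1(%2, %3), %4 = punpckh%1(%2, %3)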
%macro SBUTTERFLY3 4
    punpckh%1 %4, %2, %3
    punpckl%1 %2, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    movq  [%9+0x10], m3
    SBUTTERFLY3 bw, m6, %8, m7
    SBUTTERFLY wd, 0, 2, 3
    SBUTTERFLY wd, 4, 6, 3
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY3 wd, m1, [%9+0x10], m3
    SBUTTERFLY wd, 5, 7, 0
    SBUTTERFLY dq, 1, 5, 0
    SBUTTERFLY dq, 2, 6, 0
    punpckldq m3, m7
    movq  [%9+0x10], m2
    movq  [%9+0x20], m6
    movq  [%9+0x30], m1
    movq  [%9+0x40], m5
    movq  [%9+0x50], m3
    RESET_MM_PERMUTATION
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    SBUTTERFLY3 bw, m6, %8, m7
    movq  %9, m5
    SBUTTERFLY wd, 0, 2, 5
    SBUTTERFLY wd, 4, 6, 5
    SBUTTERFLY wd, 1, 3, 5
    movq  %11, m6
    movq  m6, %9
    SBUTTERFLY wd, 6, 7, 5
    SBUTTERFLY dq, 0, 4, 5
    SBUTTERFLY dq, 1, 6, 5
    movq  %9, m0
    movq  %10, m4
    movq  %13, m1
    movq  %14, m6
    SBUTTERFLY3 dq, m2, %11, m0
    SBUTTERFLY dq, 3, 7, 4
    movq  %11, m2
    movq  %12, m0
    movq  %15, m3
    movq  %16, m7
    RESET_MM_PERMUTATION
%endmacro

; out: %4 = nonzero where |%1-%2| > %3
; clobbers: %5
%macro DIFF_GT 5
%if avx_enabled == 0
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%else
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%endif
    por     %4, %5
    psubusb %4, %3
%endmacro
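; there is no unsigned byte |a-b| instruction, so the two saturating
; subtractions above yield max(a-b,0) and max(b-a,0); their OR is |a-b|
; (e.g. a=5, b=9: (5-9) sat = 0, (9-5) sat = 4, 0|4 = 4)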

; out: %4 = 0xff where |%1-%2| <= %3, 0 where |%1-%2| > %3
; (note the inverted sense relative to DIFF_GT: callers use the result
; directly as a filter-enable mask)
; clobbers: %5
%macro DIFF_GT2 5
%if ARCH_X86_64
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%else
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%endif
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4, m4
    SPLATW   m5, m5
    packuswb m4, m4 ; 16x alpha-1
    packuswb m5, m5 ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6
%endmacro
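
; after the final pcmpeqb, m7 = 0xff exactly where all three tests fail:
; |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta, the per-sample
; condition for the edge to be filtered at all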

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    pcmpeqb m4, m4
    pxor    m5, m1, m2 ; p0^q0
    pxor    m3, m4
    pand    m5, [pb_1] ; (p0^q0)&1
    pavgb   m3, m0     ; (p1 - q1 + 256)>>1
    pxor    m4, m1
    pavgb   m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+((p1-q1)>>2)
    pavgb   m4, m2     ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    mova    m6, [pb_A1]
    paddusb m3, m4     ; d+128+33
    psubusb m6, m3
    psubusb m3, [pb_A1]
    pminub  m6, m7
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro
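
; what the pavgb chain computes, written out (reference only, the standard
; H.264 p0/q0 update this mirrors):
;   delta = clip3(-tc, tc, (((q0-p0)<<2) + (p1-q1) + 4) >> 3)
;   p0'   = clip_uint8(p0 + delta)
;   q0'   = clip_uint8(q0 - delta)
; the work is done in unsigned bytes with delta biased by +128;
; pb_A1 (0xA1 = 128+33) removes the bias plus the accumulated rounding
; offset, and pminub against m7 = (tc & mask) performs the clip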

; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    pavgb   %6, m1, m2
    pavgb   %2, %6     ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_1] ; (p2^avg(p0,q0))&1
    psubusb %2, %6     ; (p2+((p0+q0+1)>>1))>>1
    psubusb %6, %1, %5
    paddusb %5, %1
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 0
cglobal deblock_v_luma_8, 5,5,10
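    ; the stride argument is a 32-bit int: on x86-64 the upper half of r1
    ; is undefined at entry, so it is sign-extended (here and in the other
    ; entry points below) before being used in address arithmetic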
    movsxdifnidn r1, r1d
    movd    m8, [r4] ; tc0
    lea     r4, [r1*3]
    dec     r2d      ; alpha-1
    neg     r4
    dec     r3d      ; beta-1
    add     r4, r0   ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn   m9, m7
    pand    m8, m9

    movdqa  m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; m6 = |p2-p0| <= beta-1
    pand    m6, m9
    psubb   m7, m8, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa  m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; m6 = |q2-q0| <= beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
    movsxd r7, r1d
    movsxdifnidn r1, r1d
    lea    r8, [r7+r7*2]
    lea    r6, [r0-4]
    lea    r5, [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x30 ; shadow space + r4
%else
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp
    lea    r6, [r6+r7*8]
    lea    r5, [r5+r7*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't back up r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    r1d, 0x10
%if WIN64
    mov    [rsp+0x20], r4
%endif
    call   deblock_v_luma_8

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    add    r6, 2
    add    r5, 2
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)

    shl    r7, 3
    sub    r6, r7
    sub    r5, r7
    shr    r7, 3
    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)

    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA
INIT_XMM avx
DEBLOCK_LUMA

%else

%macro DEBLOCK_LUMA 2
;-----------------------------------------------------------------------------
; void ff_deblock_v8_luma(uint8_t *pix, int stride, int alpha, int beta,
;                         int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_8, 5,5,8,2*%2
    movsxdifnidn r1, r1d
    lea     r4, [r1*3]
    dec     r2     ; alpha-1
    neg     r4
    dec     r3     ; beta-1
    add     r4, r0 ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2, r3

    mov     r3, r4mp
    pcmpeqb m3, m3
    movd    m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova    [esp+%2], m4 ; tc
    pcmpgtb m4, m3
    mova    m3, [r4] ; p2
    pand    m4, m7
    mova    [esp], m4 ; mask

    DIFF_GT2 m1, m3, m5, m6, m7 ; m6 = |p2-p0| <= beta-1
    pand    m6, m4
    pand    m4, [esp+%2] ; tc
    psubb   m7, m4, m6
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; m6 = |q2-q0| <= beta-1
    pand    m6, [esp] ; mask
    mova    m5, [esp+%2] ; tc
    psubb   m7, m6
    pand    m5, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 0,5,8,0x60+12
    movsxdifnidn r1, r1d
    mov    r0, r0mp
    mov    r3, r1m
    lea    r4, [r3*3]
    sub    r0, 4
    lea    r1, [r0+r4]
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   deblock_%1_luma_8
%ifidn %1, v8
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   deblock_%1_luma_8
%endif
    ADD    esp, 20

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    mov    r0, r0mp
    sub    r0, 2

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    lea    r1, [r0+r4]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)

    RET
%endmacro ; DEBLOCK_LUMA
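
; for the mmxext build, deblock_v8_luma_8 only filters 8 of the 16 columns
; in the tmp buffer per call, so the wrapper above calls it twice with pix
; advanced by 8 and tc0 advanced by 2 (each tc0 entry covers 4 columns)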

INIT_MMX mmxext
DEBLOCK_LUMA v8, 8
INIT_XMM sse2
DEBLOCK_LUMA v, 16
INIT_XMM avx
DEBLOCK_LUMA v, 16

%endif ; ARCH


%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
%if ARCH_X86_64
    pavgb t0, p2, p1
    pavgb t1, p0, q0
%else
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
%endif
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1
%if ARCH_X86_64
    paddb t2, p2, p1
    paddb t3, p0, q0
%else
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
%endif
    paddb t2, t3
    mova  t3, t2
    mova  t4, t2
    psrlw t2, 1
    pavgb t2, mpb_0
    pxor  t2, t0
    pand  t2, mpb_1
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

%if ARCH_X86_64
    pavgb t1, p2, q1
    psubb t2, p2, q1
%else
    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
%endif
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_1
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_0
    pxor  t3, t1
    pand  t3, mpb_1
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    pxor  t3, p0, q1
    pavgb t2, p0, q1
    pand  t3, mpb_1
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4

    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    paddb t2, t1, p2
    pavgb t1, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_0
    pxor  t2, t1
    pand  t2, mpb_1
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro
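
; the pxor/pand/pxor runs above are branchless selects, x^((x^y)&m) being
; m ? y : x; they pick, per byte, between the strong filter result
; (where mask1p), the short p0'b average (where only mask0), and the
; unfiltered pixel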

%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro

%macro DEBLOCK_LUMA_INTRA 1
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%if ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0  m12
    %define mask1p m13
%if WIN64
    %define mask1q [rsp]
%else
    %define mask1q [rsp-24]
%endif
    %define mpb_0 m14
    %define mpb_1 m15
%else
    %define spill(x) [esp+16*x]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0  spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_0 [pb_0]
    %define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
%if WIN64
cglobal deblock_%1_luma_intra_8, 4,6,16,0x10
%else
cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
%endif
    movsxdifnidn r1, r1d
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end
    neg     r4
    dec     r3d        ; beta-1
    jl .end
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%if ARCH_X86_64
    pxor    mpb_0, mpb_0
    mova    mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12          ; m12=mask0
    pavgb   t5, mpb_0
    pavgb   t5, mpb_1 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| <= alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; t2 = |p2-p0| <= beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| <= beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_0]
    pavgb   m4, [pb_1] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| <= alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| <= beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| <= beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
    RET
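
; mask1p enables the strong intra filter on the p side, i.e.
; |p0-q0| < (alpha>>2)+2 && |p2-p0| < beta; mask1q likewise for q; where
; only mask0 holds, LUMA_INTRA_P012 falls back to the short p0'b average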

INIT_MMX cpuname
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_8, 4,9,0,0x80
    movsxd r7, r1d
    movsxdifnidn r1, r1d
    lea    r8, [r7*3]
    lea    r6, [r0-4]
    lea    r5, [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x20 ; shadow space
%else
    %define pix_tmp rsp
%endif

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r6, [r6+r7*8]
    lea    r5, [r5+r7*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0, [pix_tmp+0x40]
    mov    r1, 0x10
    call   deblock_v_luma_intra_8

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea    r5, [r6+r8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    shl    r7, 3
    sub    r6, r7
    sub    r5, r7
    shr    r7, 3
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    RET
%else
cglobal deblock_h_luma_intra_8, 2,4,8,0x80
    lea    r3, [r1*3]
    sub    r0, 4
    lea    r2, [r0+r3]
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0, [r0+r1*8]
    lea    r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0, [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   deblock_%1_luma_intra_8
%ifidn %1, v8
    add    dword [rsp], 8 ; pix_tmp+8
    call   deblock_%1_luma_intra_8
%endif
    ADD    esp, 16

    mov    r1, r1m
    mov    r0, r0mp
    lea    r3, [r1*3]
    sub    r0, 4
    lea    r2, [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0, [r0+r1*8]
    lea    r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA_INTRA v8
%endif

INIT_MMX mmxext

%macro CHROMA_V_START 0
    movsxdifnidn r1, r1d
    dec    r2d ; alpha-1
    dec    r3d ; beta-1
    mov    t5, r0
    sub    t5, r1
    sub    t5, r1
%endmacro

%macro CHROMA_H_START 0
    movsxdifnidn r1, r1d
    dec    r2d
    dec    r3d
    sub    r0, 2
    lea    t6, [r1*3]
    mov    t5, r0
    add    r0, t6
%endmacro

%define t5 r5
%define t6 r6

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma(uint8_t *pix, int stride, int alpha, int beta,
;                          int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_8, 5,6
    CHROMA_V_START
    movq   m0, [t5]
    movq   m1, [t5+r1]
    movq   m2, [r0]
    movq   m3, [r0+r1]
    call   ff_chroma_inter_body_mmxext
    movq   [t5+r1], m1
    movq   [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma(uint8_t *pix, int stride, int alpha, int beta,
;                          int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_8, 5,7
%if ARCH_X86_64
    ; This could use the red zone on 64 bit unix to avoid the stack pointer
    ; readjustment, but valgrind assumes the red zone is clobbered on
    ; function calls and returns.
    sub    rsp, 16
    %define buf0 [rsp]
    %define buf1 [rsp+8]
%else
    %define buf0 r0m
    %define buf1 r2m
%endif
    CHROMA_H_START
    TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    movq   buf0, m0
    movq   buf1, m3
    call   ff_chroma_inter_body_mmxext
    movq   m0, buf0
    movq   m3, buf1
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
%if ARCH_X86_64
    add    rsp, 16
%endif
    RET

ALIGN 16
ff_chroma_inter_body_mmxext:
    LOAD_MASK r2d, r3d
    movd   m6, [r4] ; tc0
    punpcklbw m6, m6
    pand   m7, m6
    DEBLOCK_P0_Q0
    ret
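
; the chroma inter filter only modifies p0/q0 (one pixel on each side of
; the edge); the single punpcklbw doubles the 4 tc0 bytes since each tc0
; entry covers 2 of the 8 chroma columns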


; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
    movq    m4, %1
    pxor    m4, %3
    pand    m4, [pb_1] ; m4 = (p0^q1)&1
    pavgb   %1, %3
    psubusb %1, m4
    pavgb   %1, %2     ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro
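
; the parity-bit correction makes pavgb's round-to-up averaging match the
; spec formula exactly; e.g. p0=1, p1=0, q1=2:
; avg(1,2)=2, (1^2)&1=1, 2-1=1, avg(0,1)=1 == (1+2+0+2)>>2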

%define t5 r4
%define t6 r5

;------------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_8, 4,5
    CHROMA_V_START
    movq   m0, [t5]
    movq   m1, [t5+r1]
    movq   m2, [r0]
    movq   m3, [r0+r1]
    call   ff_chroma_intra_body_mmxext
    movq   [t5+r1], m1
    movq   [r0], m2
    RET

;------------------------------------------------------------------------------
; void ff_deblock_h_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_8, 4,6
    CHROMA_H_START
    TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    call   ff_chroma_intra_body_mmxext
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

ALIGN 16
ff_chroma_intra_body_mmxext:
    LOAD_MASK r2d, r3d
    movq   m5, m1
    movq   m6, m2
    CHROMA_INTRA_P0 m1, m0, m3
    CHROMA_INTRA_P0 m2, m3, m0
    psubb  m1, m5
    psubb  m2, m6
    pand   m1, m7
    pand   m2, m7
    paddb  m1, m5
    paddb  m2, m6
    ret
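
; the psubb/pand/paddb tail is a conditional update:
; new = old + ((filtered - old) & mask), so filtered values are kept only
; where LOAD_MASK enabled filtering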

;-----------------------------------------------------------------------------
; void ff_h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
;                                   int8_t ref[2][40], int16_t mv[2][40][2],
;                                   int bidir, int edges, int step,
;                                   int mask_mv0, int mask_mv1, int field);
;
; bidir    is 0 or 1
; edges    is 1 or 4
; step     is 1 or 2
; mask_mv0 is 0 or 3
; mask_mv1 is 0 or 1
; field    is 0 or 1
;-----------------------------------------------------------------------------
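; sketch of the strength rule implemented below (non-intra edges):
;   bs = 2 if either side has coded residual (nnz[b] || nnz[bn])
;   bs = 1 if the sides use different reference frames, or any mv
;          component differs by >= 4 quarter-pels (>= 2 for the vertical
;          component when field=1, hence pb_3_1 vs pb_3)
;   bs = 0 otherwise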
%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
                                        ; dir, d_idx, mask_dir, bidir
%define edgesd   %1
%define stepd    %2
%define mask_mvd %3
%define dir      %4
%define d_idx    %5
%define mask_dir %6
%define bidir    %7
    xor       b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step)
%%.b_idx_loop:
%if mask_dir == 0
    pxor      m0, m0
%endif
    test      b_idxd, dword mask_mvd
    jnz       %%.skip_loop_iter ; if (!(b_idx & mask_mv))
%if bidir == 1
    movd      m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] }
    punpckldq m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] }
    pshufw    m0, [refq+b_idxq+12], 0x44 ; { ref0[b], ref0[b] }
    pshufw    m1, [refq+b_idxq+52], 0x44 ; { ref1[b], ref1[b] }
    pshufw    m3, m2, 0x4E               ; { ref1[bn], ref0[bn] }
    psubb     m0, m2                     ; { ref0[b] != ref0[bn],
                                         ;   ref0[b] != ref1[bn] }
    psubb     m1, m3                     ; { ref1[b] != ref1[bn],
                                         ;   ref1[b] != ref0[bn] }

    por       m0, m1
    mova      m1, [mvq+b_idxq*4+(d_idx+12)*4]
    mova      m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    mova      m3, m1
    mova      m4, m2
    psubw     m1, [mvq+b_idxq*4+12*4]
    psubw     m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw     m3, [mvq+b_idxq*4+52*4]
    psubw     m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb  m1, m2
    packsswb  m3, m4
    paddb     m1, m6
    paddb     m3, m6
    psubusb   m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb   m3, m5
    packsswb  m1, m3

    por       m0, m1
    mova      m1, [mvq+b_idxq*4+(d_idx+52)*4]
    mova      m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
    mova      m3, m1
    mova      m4, m2
    psubw     m1, [mvq+b_idxq*4+12*4]
    psubw     m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw     m3, [mvq+b_idxq*4+52*4]
    psubw     m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb  m1, m2
    packsswb  m3, m4
    paddb     m1, m6
    paddb     m3, m6
    psubusb   m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb   m3, m5
    packsswb  m1, m3

    pshufw    m1, m1, 0x4E
    por       m0, m1
    pshufw    m1, m0, 0x4E
    pminub    m0, m1
%else ; bidir == 0
    movd      m0, [refq+b_idxq+12]
    psubb     m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn]

    mova      m1, [mvq+b_idxq*4+12*4]
    mova      m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw     m1, [mvq+b_idxq*4+(d_idx+12)*4]
    psubw     m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    packsswb  m1, m2
    paddb     m1, m6
    psubusb   m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    packsswb  m1, m1
    por       m0, m1
%endif ; bidir == 1/0

%%.skip_loop_iter:
    movd      m1, [nnzq+b_idxq+12]
    por       m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn]

    pminub    m1, m7
    pminub    m0, m7
    psllw     m1, 1
    pxor      m2, m2
    pmaxub    m1, m0
    punpcklbw m1, m2
    movq      [bsq+b_idxq+32*dir], m1

    add       b_idxd, dword stepd
    cmp       b_idxd, dword edgesd
    jl        %%.b_idx_loop
%endmacro
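
; pminub against pb_1 (m7) saturates the per-position flags to 1; psllw
; then turns a set nnz flag into strength 2, pmaxub merges it with the
; ref/mv strength-1 flags, and punpcklbw widens to the int16_t bs[] layout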

INIT_MMX mmxext
cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \
                                            step, mask_mv0, mask_mv1, field
%define b_idxq bidirq
%define b_idxd bidird
    cmp   dword fieldm, 0
    mova  m7, [pb_1]
    mova  m5, [pb_3]
    je .nofield
    mova  m5, [pb_3_1]
.nofield:
    mova  m6, m5
    paddb m5, m5

    shl   dword stepd, 3
    shl   dword edgesd, 3
%if ARCH_X86_32
%define mask_mv0d mask_mv0m
%define mask_mv1d mask_mv1m
%endif
    shl   dword mask_mv1d, 3
    shl   dword mask_mv0d, 3

    cmp   dword bidird, 0
    jne .bidir
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8,  0, 0
    loop_filter_strength_iteration     32,     8, mask_mv0d, 0, -1, -1, 0

    mova  m0, [bsq+mmsize*0]
    mova  m1, [bsq+mmsize*1]
    mova  m2, [bsq+mmsize*2]
    mova  m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova  [bsq+mmsize*0], m0
    mova  [bsq+mmsize*1], m1
    mova  [bsq+mmsize*2], m2
    mova  [bsq+mmsize*3], m3
    RET

.bidir:
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8,  0, 1
    loop_filter_strength_iteration     32,     8, mask_mv0d, 0, -1, -1, 1

    mova  m0, [bsq+mmsize*0]
    mova  m1, [bsq+mmsize*1]
    mova  m2, [bsq+mmsize*2]
    mova  m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova  [bsq+mmsize*0], m0
    mova  [bsq+mmsize*1], m1
    mova  [bsq+mmsize*2], m2
    mova  [bsq+mmsize*3], m3
    RET
1082 RET