h264/x86: sign extend int stride in deblock functions
libavcodec/x86/h264_deblock.asm
;*****************************************************************************
;* MMX/SSE2/AVX-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pb_A1:  times 16 db 0xA1
pb_3_1: times 4 db 3, 1

SECTION .text

cextern pb_0
cextern pb_1
cextern pb_3

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

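; the 5-argument form applies a common byte offset to both base pointers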
%define PASS8ROWS(base, base3, stride, stride3, offset) \
    PASS8ROWS(base+offset, base3+offset, stride, stride3)

; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 11
    movh m0, %4
    movh m2, %5
    movh m1, %6
    movh m3, %7
    punpckl%1 m0, m2
    punpckl%1 m1, m3
    mova m2, m0
    punpckl%2 m0, m1
    punpckh%2 m2, m1

    movh m4, %8
    movh m6, %9
    movh m5, %10
    movh m7, %11
    punpckl%1 m4, m6
    punpckl%1 m5, m7
    mova m6, m4
    punpckl%2 m4, m5
    punpckh%2 m6, m5

    punpckh%3 m1, m0, m4
    punpckh%3 m3, m2, m6
    punpckl%3 m0, m4
    punpckl%3 m2, m6
%endmacro

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    punpckhdq m4, m0, m0
    punpckhdq m5, m1, m1
    punpckhdq m6, m2, m2

    punpcklbw m0, m1
    punpcklbw m2, m3
    punpcklwd m1, m0, m2
    punpckhwd m0, m2
    movh %1, m1
    punpckhdq m1, m1
    movh %2, m1
    movh %3, m0
    punpckhdq m0, m0
    movh %4, m0

    punpckhdq m3, m3
    punpcklbw m4, m5
    punpcklbw m6, m3
    punpcklwd m5, m4, m6
    punpckhwd m4, m6
    movh %5, m5
    punpckhdq m5, m5
    movh %6, m5
    movh %7, m4
    punpckhdq m4, m4
    movh %8, m4
%endmacro

%macro TRANSPOSE4x8B_LOAD 8
    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
%endmacro

%macro SBUTTERFLY3 4
    punpckh%1 %4, %2, %3
    punpckl%1 %2, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    RESET_MM_PERMUTATION
    movq m0, %1
    movq m1, %2
    movq m2, %3
    movq m3, %4
    movq m4, %5
    movq m5, %6
    movq m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    movq [%9+0x10], m3
    SBUTTERFLY3 bw, m6, %8, m7
    SBUTTERFLY wd, 0, 2, 3
    SBUTTERFLY wd, 4, 6, 3
    punpckhdq m0, m4
    movq [%9+0x00], m0
    SBUTTERFLY3 wd, m1, [%9+0x10], m3
    SBUTTERFLY wd, 5, 7, 0
    SBUTTERFLY dq, 1, 5, 0
    SBUTTERFLY dq, 2, 6, 0
    punpckldq m3, m7
    movq [%9+0x10], m2
    movq [%9+0x20], m6
    movq [%9+0x30], m1
    movq [%9+0x40], m5
    movq [%9+0x50], m3
    RESET_MM_PERMUTATION
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    RESET_MM_PERMUTATION
    movq m0, %1
    movq m1, %2
    movq m2, %3
    movq m3, %4
    movq m4, %5
    movq m5, %6
    movq m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    SBUTTERFLY3 bw, m6, %8, m7
    movq %9, m5
    SBUTTERFLY wd, 0, 2, 5
    SBUTTERFLY wd, 4, 6, 5
    SBUTTERFLY wd, 1, 3, 5
    movq %11, m6
    movq m6, %9
    SBUTTERFLY wd, 6, 7, 5
    SBUTTERFLY dq, 0, 4, 5
    SBUTTERFLY dq, 1, 6, 5
    movq %9, m0
    movq %10, m4
    movq %13, m1
    movq %14, m6
    SBUTTERFLY3 dq, m2, %11, m0
    SBUTTERFLY dq, 3, 7, 4
    movq %11, m2
    movq %12, m0
    movq %15, m3
    movq %16, m7
    RESET_MM_PERMUTATION
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
%if avx_enabled == 0
    mova %5, %2
    mova %4, %1
    psubusb %5, %1
    psubusb %4, %2
%else
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%endif
    por %4, %5
    psubusb %4, %3
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
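; (note: unlike DIFF_GT above, the final pcmpeqb turns %4 into a full 0xFF byte
;  mask that is set where the difference does NOT exceed %3; the callers below
;  rely on that polarity when building the p1/q1 filter masks)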
%macro DIFF_GT2 5
%if ARCH_X86_64
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%else
    mova %5, %2
    mova %4, %1
    psubusb %5, %1
    psubusb %4, %2
%endif
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd m4, %1
    movd m5, %2
    SPLATW m4, m4
    SPLATW m5, m5
    packuswb m4, m4 ; 16x alpha-1
    packuswb m5, m5 ; 16x beta-1
%if %0>2
    mova %3, m4
%endif
    DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por m7, m4
    DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por m7, m4
    pxor m6, m6
    pcmpeqb m7, m6
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
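; branchless pavgb/psubusb implementation of the standard normal-filter update:
;   d  = clip(((q0-p0)*4 + (p1-q1) + 4) >> 3, -tc, tc)
;   p0 = p0 + d,  q0 = q0 - d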
%macro DEBLOCK_P0_Q0 0
    pcmpeqb m4, m4
    pxor m5, m1, m2 ; p0^q0
    pxor m3, m4
    pand m5, [pb_1] ; (p0^q0)&1
    pavgb m3, m0 ; (p1 - q1 + 256)>>1
    pxor m4, m1
    pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pavgb m4, m2 ; (q0 - p0 + 256)>>1
    pavgb m3, m5
    mova m6, [pb_A1]
    paddusb m3, m4 ; d+128+33
    psubusb m6, m3
    psubusb m3, [pb_A1]
    pminub m6, m7
    pminub m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro

; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    pavgb %6, m1, m2
    pavgb %2, %6 ; avg(p2,avg(p0,q0))
    pxor %6, %3
    pand %6, [pb_1] ; (p2^avg(p0,q0))&1
    psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
    psubusb %6, %1, %5
    paddusb %5, %1
    pmaxub %2, %6
    pminub %2, %5
    mova %4, %2
%endmacro

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
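; note: stride is a 32-bit int in the C prototype, so its upper register half is
; undefined on x86-64; movsxdifnidn sign-extends it to a full register (and
; expands to nothing on x86-32, where r1 and r1d are the same register)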
%macro DEBLOCK_LUMA 0
cglobal deblock_v_luma_8, 5,5,10
    movsxdifnidn r1, r1d
    movd m8, [r4] ; tc0
    lea r4, [r1*3]
    dec r2d ; alpha-1
    neg r4
    dec r3d ; beta-1
    add r4, r0 ; pix-3*stride

    mova m0, [r4+r1] ; p1
    mova m1, [r4+2*r1] ; p0
    mova m2, [r0] ; q0
    mova m3, [r0+r1] ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn m9, m7
    pand m8, m9

    movdqa m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand m6, m9
    psubb m7, m8, m6
    pand m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand m6, m9
    pand m8, m6
    psubb m7, m6
    mova m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova [r4+2*r1], m1
    mova [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
    movsxd r7, r1d
    movsxdifnidn r1, r1d
    lea r8, [r7+r7*2]
    lea r6, [r0-4]
    lea r5, [r0-4+r8]
%if WIN64
%define pix_tmp rsp+0x30 ; shadow space + r4
%else
%define pix_tmp rsp
%endif

; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp
    lea r6, [r6+r7*8]
    lea r5, [r5+r7*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8

; vertical filter
; alpha, beta, tc0 are still in r2d, r3d, r4
; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
    lea r0, [pix_tmp+0x30]
    mov r1d, 0x10
%if WIN64
    mov [rsp+0x20], r4
%endif
    call deblock_v_luma_8

; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    add r6, 2
    add r5, 2
    movq m0, [pix_tmp+0x18]
    movq m1, [pix_tmp+0x28]
    movq m2, [pix_tmp+0x38]
    movq m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)

    shl r7, 3
    sub r6, r7
    sub r5, r7
    shr r7, 3
    movq m0, [pix_tmp+0x10]
    movq m1, [pix_tmp+0x20]
    movq m2, [pix_tmp+0x30]
    movq m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)

    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA
INIT_XMM avx
DEBLOCK_LUMA

%else

%macro DEBLOCK_LUMA 2
;-----------------------------------------------------------------------------
; void ff_deblock_v8_luma(uint8_t *pix, int stride, int alpha, int beta,
;                         int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_8, 5,5,8,2*%2
    movsxdifnidn r1, r1d
    lea r4, [r1*3]
    dec r2 ; alpha-1
    neg r4
    dec r3 ; beta-1
    add r4, r0 ; pix-3*stride

    mova m0, [r4+r1] ; p1
    mova m1, [r4+2*r1] ; p0
    mova m2, [r0] ; q0
    mova m3, [r0+r1] ; q1
    LOAD_MASK r2, r3

    mov r3, r4mp
    pcmpeqb m3, m3
    movd m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova [esp+%2], m4 ; tc
    pcmpgtb m4, m3
    mova m3, [r4] ; p2
    pand m4, m7
    mova [esp], m4 ; mask

    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand m6, m4
    pand m4, [esp+%2] ; tc
    psubb m7, m4, m6
    pand m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand m6, [esp] ; mask
    mova m5, [esp+%2] ; tc
    psubb m7, m6
    pand m5, m6
    mova m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova [r4+2*r1], m1
    mova [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 0,5,8,0x60+12
    movsxdifnidn r1, r1d
    mov r0, r0mp
    mov r3, r1m
    lea r4, [r3*3]
    sub r0, 4
    lea r1, [r0+r4]
%define pix_tmp esp+12

; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea r0, [r0+r3*8]
    lea r1, [r1+r3*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

; vertical filter
    lea r0, [pix_tmp+0x30]
    PUSH dword r4m
    PUSH dword r3m
    PUSH dword r2m
    PUSH dword 16
    PUSH dword r0
    call deblock_%1_luma_8
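; the v8 (MMX) variant filters only 8 pixels per call, so it is called a second
; time for the other half of the transposed buffer, with the pix pointer moved
; by 8 bytes and tc0 advanced by 2 entries (each tc0 entry covers 4 pixels)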
%ifidn %1, v8
    add dword [esp   ], 8 ; pix_tmp+0x38
    add dword [esp+16], 2 ; tc0+2
    call deblock_%1_luma_8
%endif
    ADD esp, 20

; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    mov r0, r0mp
    sub r0, 2

    movq m0, [pix_tmp+0x10]
    movq m1, [pix_tmp+0x20]
    lea r1, [r0+r4]
    movq m2, [pix_tmp+0x30]
    movq m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)

    lea r0, [r0+r3*8]
    lea r1, [r1+r3*8]
    movq m0, [pix_tmp+0x18]
    movq m1, [pix_tmp+0x28]
    movq m2, [pix_tmp+0x38]
    movq m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)

    RET
%endmacro ; DEBLOCK_LUMA

INIT_MMX mmxext
DEBLOCK_LUMA v8, 8
INIT_XMM sse2
DEBLOCK_LUMA v, 16
INIT_XMM avx
DEBLOCK_LUMA v, 16

%endif ; ARCH


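; Strong (intra) filter for the p side of an edge: computes p0', p1', p2' and
; stores them to %1..%3 (%4 is the address of p3). p1'/p2' are written only
; where mask1p is set; p0 gets the weaker (2*p1+p0+q1+2)>>2 value where only
; mask0 is set, and is left unchanged elsewhere.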
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
%if ARCH_X86_64
    pavgb t0, p2, p1
    pavgb t1, p0, q0
%else
    mova t0, p2
    mova t1, p0
    pavgb t0, p1
    pavgb t1, q0
%endif
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova t5, t1
%if ARCH_X86_64
    paddb t2, p2, p1
    paddb t3, p0, q0
%else
    mova t2, p2
    mova t3, p0
    paddb t2, p1
    paddb t3, q0
%endif
    paddb t2, t3
    mova t3, t2
    mova t4, t2
    psrlw t2, 1
    pavgb t2, mpb_0
    pxor t2, t0
    pand t2, mpb_1
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

%if ARCH_X86_64
    pavgb t1, p2, q1
    psubb t2, p2, q1
%else
    mova t1, p2
    mova t2, p2
    pavgb t1, q1
    psubb t2, q1
%endif
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand t2, mpb_1
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_0
    pxor t3, t1
    pand t3, mpb_1
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    pxor t3, p0, q1
    pavgb t2, p0, q1
    pand t3, mpb_1
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4

    pxor t1, t2
    pxor t2, p0
    pand t1, mask1p
    pand t2, mask0
    pxor t1, t2
    pxor t1, p0
    mova %1, t1 ; store p0

    mova t1, %4 ; p3
    paddb t2, t1, p2
    pavgb t1, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_0
    pxor t2, t1
    pand t2, mpb_1
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor t0, p1
    pxor t1, p2
    pand t0, mask1p
    pand t1, mask1p
    pxor t0, p1
    pxor t1, p2
    mova %2, t0 ; store p1
    mova %3, t1 ; store p2
%endmacro

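; redefine the p/q register aliases so LUMA_INTRA_P012 can be reused unchanged
; to filter the q side of the edge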
%macro LUMA_INTRA_SWAP_PQ 0
%define q1 m0
%define q0 m1
%define p0 m2
%define p1 m3
%define p2 q2
%define mask1p mask1q
%endmacro

%macro DEBLOCK_LUMA_INTRA 1
%define p1 m0
%define p0 m1
%define q0 m2
%define q1 m3
%define t0 m4
%define t1 m5
%define t2 m6
%define t3 m7
%if ARCH_X86_64
%define p2 m8
%define q2 m9
%define t4 m10
%define t5 m11
%define mask0 m12
%define mask1p m13
%if WIN64
%define mask1q [rsp]
%else
%define mask1q [rsp-24]
%endif
%define mpb_0 m14
%define mpb_1 m15
%else
%define spill(x) [esp+16*x]
%define p2 [r4+r1]
%define q2 [r0+2*r1]
%define t4 spill(0)
%define t5 spill(1)
%define mask0 spill(2)
%define mask1p spill(3)
%define mask1q spill(4)
%define mpb_0 [pb_0]
%define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
%if WIN64
cglobal deblock_%1_luma_intra_8, 4,6,16,0x10
%else
cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
%endif
    movsxdifnidn r1, r1d
    lea r4, [r1*4]
    lea r5, [r1*3] ; 3*stride
    dec r2d ; alpha-1
    jl .end
    neg r4
    dec r3d ; beta-1
    jl .end
    add r4, r0 ; pix-4*stride
    mova p1, [r4+2*r1]
    mova p0, [r4+r5]
    mova q0, [r0]
    mova q1, [r0+r1]
%if ARCH_X86_64
    pxor mpb_0, mpb_0
    mova mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP 7, 12 ; m12=mask0
    pavgb t5, mpb_0
    pavgb t5, mpb_1 ; alpha/4+1
    movdqa p2, [r4+r1]
    movdqa q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand t0, mask0
    pand t4, t0
    pand t2, t0
    mova mask1q, t4
    mova mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova m4, t5
    mova mask0, m7
    pavgb m4, [pb_0]
    pavgb m4, [pb_1] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand m4, m6
    mova mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand m4, m6
    mova mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
    RET

INIT_MMX cpuname
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_8, 4,9,0,0x80
    movsxd r7, r1d
    movsxdifnidn r1, r1d
    lea r8, [r7*3]
    lea r6, [r0-4]
    lea r5, [r0-4+r8]
%if WIN64
%define pix_tmp rsp+0x20 ; shadow space
%else
%define pix_tmp rsp
%endif

; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea r6, [r6+r7*8]
    lea r5, [r5+r7*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea r0, [pix_tmp+0x40]
    mov r1, 0x10
    call deblock_v_luma_intra_8

; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea r5, [r6+r8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    shl r7, 3
    sub r6, r7
    sub r5, r7
    shr r7, 3
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    RET
%else
cglobal deblock_h_luma_intra_8, 2,4,8,0x80
    lea r3, [r1*3]
    sub r0, 4
    lea r2, [r0+r3]
%define pix_tmp rsp

; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea r0, [r0+r1*8]
    lea r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea r0, [pix_tmp+0x40]
    PUSH dword r3m
    PUSH dword r2m
    PUSH dword 16
    PUSH r0
    call deblock_%1_luma_intra_8
%ifidn %1, v8
    add dword [rsp], 8 ; pix_tmp+8
    call deblock_%1_luma_intra_8
%endif
    ADD esp, 16

    mov r1, r1m
    mov r0, r0mp
    lea r3, [r1*3]
    sub r0, 4
    lea r2, [r0+r3]
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea r0, [r0+r1*8]
    lea r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA_INTRA v8
%endif

INIT_MMX mmxext

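; in: r0=pix, r1=stride; turns alpha/beta into alpha-1/beta-1 and sets
; t5 = pix - 2*stride (the p1 row), leaving r0 pointing at q0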
%macro CHROMA_V_START 0
    movsxdifnidn r1, r1d
    dec r2d ; alpha-1
    dec r3d ; beta-1
    mov t5, r0
    sub t5, r1
    sub t5, r1
%endmacro

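; in: r0=pix, r1=stride; leaves t5 = pix-2 and r0 = pix-2 + 3*stride (the two
; base pointers used by PASS8ROWS for the 4x8 transpose), with t6 = 3*stride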
%macro CHROMA_H_START 0
    movsxdifnidn r1, r1d
    dec r2d
    dec r3d
    sub r0, 2
    lea t6, [r1*3]
    mov t5, r0
    add r0, t6
%endmacro

%define t5 r5
%define t6 r6

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma(uint8_t *pix, int stride, int alpha, int beta,
;                          int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_8, 5,6
    CHROMA_V_START
    movq m0, [t5]
    movq m1, [t5+r1]
    movq m2, [r0]
    movq m3, [r0+r1]
    call ff_chroma_inter_body_mmxext
    movq [t5+r1], m1
    movq [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma(uint8_t *pix, int stride, int alpha, int beta,
;                          int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_8, 5,7
%if ARCH_X86_64
; This could use the red zone on 64 bit unix to avoid the stack pointer
; readjustment, but valgrind assumes the red zone is clobbered on
; function calls and returns.
    sub rsp, 16
%define buf0 [rsp]
%define buf1 [rsp+8]
%else
%define buf0 r0m
%define buf1 r2m
%endif
    CHROMA_H_START
    TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    movq buf0, m0
    movq buf1, m3
    call ff_chroma_inter_body_mmxext
    movq m0, buf0
    movq m3, buf1
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
%if ARCH_X86_64
    add rsp, 16
%endif
    RET

ALIGN 16
ff_chroma_inter_body_mmxext:
    LOAD_MASK r2d, r3d
    movd m6, [r4] ; tc0
    punpcklbw m6, m6
    pand m7, m6
    DEBLOCK_P0_Q0
    ret



; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
    movq m4, %1
    pxor m4, %3
    pand m4, [pb_1] ; m4 = (p0^q1)&1
    pavgb %1, %3
    psubusb %1, m4
    pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro

%define t5 r4
%define t6 r5

;------------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_8, 4,5
    CHROMA_V_START
    movq m0, [t5]
    movq m1, [t5+r1]
    movq m2, [r0]
    movq m3, [r0+r1]
    call ff_chroma_intra_body_mmxext
    movq [t5+r1], m1
    movq [r0], m2
    RET

;------------------------------------------------------------------------------
; void ff_deblock_h_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_8, 4,6
    CHROMA_H_START
    TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    call ff_chroma_intra_body_mmxext
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

ALIGN 16
ff_chroma_intra_body_mmxext:
    LOAD_MASK r2d, r3d
    movq m5, m1
    movq m6, m2
    CHROMA_INTRA_P0 m1, m0, m3
    CHROMA_INTRA_P0 m2, m3, m0
    psubb m1, m5
    psubb m2, m6
    pand m1, m7
    pand m2, m7
    paddb m1, m5
    paddb m2, m6
    ret

;-----------------------------------------------------------------------------
; void ff_h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
;                                   int8_t ref[2][40], int16_t mv[2][40][2],
;                                   int bidir, int edges, int step,
;                                   int mask_mv0, int mask_mv1, int field);
;
; bidir    is 0 or 1
; edges    is 1 or 4
; step     is 1 or 2
; mask_mv0 is 0 or 3
; mask_mv1 is 0 or 1
; field    is 0 or 1
;-----------------------------------------------------------------------------
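; boundary strength computed per edge position:
;   bs = 2 if either adjacent block has nonzero coefficients (nnz),
;   bs = 1 if the references differ or any mv component differs by >= 4
;          (>= 2 for the vertical component in field mode, via pb_3_1),
;   bs = 0 otherwise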
%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
                                        ; dir, d_idx, mask_dir, bidir
%define edgesd   %1
%define stepd    %2
%define mask_mvd %3
%define dir      %4
%define d_idx    %5
%define mask_dir %6
%define bidir    %7
    xor b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step)
%%.b_idx_loop:
%if mask_dir == 0
    pxor m0, m0
%endif
    test b_idxd, dword mask_mvd
    jnz %%.skip_loop_iter ; if (!(b_idx & mask_mv))
%if bidir == 1
    movd m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] }
    punpckldq m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] }
    pshufw m0, [refq+b_idxq+12], 0x44 ; { ref0[b], ref0[b] }
    pshufw m1, [refq+b_idxq+52], 0x44 ; { ref1[b], ref1[b] }
    pshufw m3, m2, 0x4E ; { ref1[bn], ref0[bn] }
    psubb m0, m2 ; { ref0[b] != ref0[bn],
                 ;   ref0[b] != ref1[bn] }
    psubb m1, m3 ; { ref1[b] != ref1[bn],
                 ;   ref1[b] != ref0[bn] }

    por m0, m1
    mova m1, [mvq+b_idxq*4+(d_idx+12)*4]
    mova m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    mova m3, m1
    mova m4, m2
    psubw m1, [mvq+b_idxq*4+12*4]
    psubw m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw m3, [mvq+b_idxq*4+52*4]
    psubw m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb m1, m2
    packsswb m3, m4
    paddb m1, m6
    paddb m3, m6
    psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb m3, m5
    packsswb m1, m3

    por m0, m1
    mova m1, [mvq+b_idxq*4+(d_idx+52)*4]
    mova m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
    mova m3, m1
    mova m4, m2
    psubw m1, [mvq+b_idxq*4+12*4]
    psubw m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw m3, [mvq+b_idxq*4+52*4]
    psubw m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb m1, m2
    packsswb m3, m4
    paddb m1, m6
    paddb m3, m6
    psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb m3, m5
    packsswb m1, m3

    pshufw m1, m1, 0x4E
    por m0, m1
    pshufw m1, m0, 0x4E
    pminub m0, m1
%else ; bidir == 0
    movd m0, [refq+b_idxq+12]
    psubb m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn]

    mova m1, [mvq+b_idxq*4+12*4]
    mova m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw m1, [mvq+b_idxq*4+(d_idx+12)*4]
    psubw m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    packsswb m1, m2
    paddb m1, m6
    psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    packsswb m1, m1
    por m0, m1
%endif ; bidir == 1/0

%%.skip_loop_iter:
    movd m1, [nnzq+b_idxq+12]
    por m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn]

    pminub m1, m7
    pminub m0, m7
    psllw m1, 1
    pxor m2, m2
    pmaxub m1, m0
    punpcklbw m1, m2
    movq [bsq+b_idxq+32*dir], m1

    add b_idxd, dword stepd
    cmp b_idxd, dword edgesd
    jl %%.b_idx_loop
%endmacro

INIT_MMX mmxext
cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \
                                            step, mask_mv0, mask_mv1, field
%define b_idxq bidirq
%define b_idxd bidird
    cmp dword fieldm, 0
    mova m7, [pb_1]
    mova m5, [pb_3]
    je .nofield
    mova m5, [pb_3_1]
.nofield:
    mova m6, m5
    paddb m5, m5

    shl dword stepd, 3
    shl dword edgesd, 3
%if ARCH_X86_32
%define mask_mv0d mask_mv0m
%define mask_mv1d mask_mv1m
%endif
    shl dword mask_mv1d, 3
    shl dword mask_mv0d, 3

    cmp dword bidird, 0
    jne .bidir
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 0
    loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 0

    mova m0, [bsq+mmsize*0]
    mova m1, [bsq+mmsize*1]
    mova m2, [bsq+mmsize*2]
    mova m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova [bsq+mmsize*0], m0
    mova [bsq+mmsize*1], m1
    mova [bsq+mmsize*2], m2
    mova [bsq+mmsize*3], m3
    RET

.bidir:
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 1
    loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 1

    mova m0, [bsq+mmsize*0]
    mova m1, [bsq+mmsize*1]
    mova m2, [bsq+mmsize*2]
    mova m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova [bsq+mmsize*0], m0
    mova [bsq+mmsize*1], m1
    mova [bsq+mmsize*2], m2
    mova [bsq+mmsize*3], m3
    RET