;*****************************************************************************
;* MMX/SSE2/AVX-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pb_A1: times 16 db 0xA1
pb_3_1: times 4 db 3, 1

SECTION .text

cextern pb_0
cextern pb_1
cextern pb_3

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

%define PASS8ROWS(base, base3, stride, stride3, offset) \
    PASS8ROWS(base+offset, base3+offset, stride, stride3)
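; e.g. with base3 = base+3*stride and stride3 = 3*stride, PASS8ROWS(base, base3,
; stride, stride3) evaluates to [base], [base+1*stride], ..., [base+7*stride],
; keeping every address computable with a single register*constant scale.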

; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 11
    movh m0, %4
    movh m2, %5
    movh m1, %6
    movh m3, %7
    punpckl%1 m0, m2
    punpckl%1 m1, m3
    mova m2, m0
    punpckl%2 m0, m1
    punpckh%2 m2, m1

    movh m4, %8
    movh m6, %9
    movh m5, %10
    movh m7, %11
    punpckl%1 m4, m6
    punpckl%1 m5, m7
    mova m6, m4
    punpckl%2 m4, m5
    punpckh%2 m6, m5

    punpckh%3 m1, m0, m4
    punpckh%3 m3, m2, m6
    punpckl%3 m0, m4
    punpckl%3 m2, m6
%endmacro

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    punpckhdq m4, m0, m0
    punpckhdq m5, m1, m1
    punpckhdq m6, m2, m2

    punpcklbw m0, m1
    punpcklbw m2, m3
    punpcklwd m1, m0, m2
    punpckhwd m0, m2
    movh %1, m1
    punpckhdq m1, m1
    movh %2, m1
    movh %3, m0
    punpckhdq m0, m0
    movh %4, m0

    punpckhdq m3, m3
    punpcklbw m4, m5
    punpcklbw m6, m3
    punpcklwd m5, m4, m6
    punpckhwd m4, m6
    movh %5, m5
    punpckhdq m5, m5
    movh %6, m5
    movh %7, m4
    punpckhdq m4, m4
    movh %8, m4
%endmacro

%macro TRANSPOSE4x8B_LOAD 8
    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
%endmacro

%macro SBUTTERFLY3 4
    punpckh%1 %4, %2, %3
    punpckl%1 %2, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    RESET_MM_PERMUTATION
    movq m0, %1
    movq m1, %2
    movq m2, %3
    movq m3, %4
    movq m4, %5
    movq m5, %6
    movq m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    movq [%9+0x10], m3
    SBUTTERFLY3 bw, m6, %8, m7
    SBUTTERFLY wd, 0, 2, 3
    SBUTTERFLY wd, 4, 6, 3
    punpckhdq m0, m4
    movq [%9+0x00], m0
    SBUTTERFLY3 wd, m1, [%9+0x10], m3
    SBUTTERFLY wd, 5, 7, 0
    SBUTTERFLY dq, 1, 5, 0
    SBUTTERFLY dq, 2, 6, 0
    punpckldq m3, m7
    movq [%9+0x10], m2
    movq [%9+0x20], m6
    movq [%9+0x30], m1
    movq [%9+0x40], m5
    movq [%9+0x50], m3
    RESET_MM_PERMUTATION
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    RESET_MM_PERMUTATION
    movq m0, %1
    movq m1, %2
    movq m2, %3
    movq m3, %4
    movq m4, %5
    movq m5, %6
    movq m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    SBUTTERFLY3 bw, m6, %8, m7
    movq %9, m5
    SBUTTERFLY wd, 0, 2, 5
    SBUTTERFLY wd, 4, 6, 5
    SBUTTERFLY wd, 1, 3, 5
    movq %11, m6
    movq m6, %9
    SBUTTERFLY wd, 6, 7, 5
    SBUTTERFLY dq, 0, 4, 5
    SBUTTERFLY dq, 1, 6, 5
    movq %9, m0
    movq %10, m4
    movq %13, m1
    movq %14, m6
    SBUTTERFLY3 dq, m2, %11, m0
    SBUTTERFLY dq, 3, 7, 4
    movq %11, m2
    movq %12, m0
    movq %15, m3
    movq %16, m7
    RESET_MM_PERMUTATION
%endmacro
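
; These transpose helpers exist so the horizontal (h) entry points can reuse
; the vertical filter code: image rows are gathered into columns of a small
; stack buffer, filtered there, and scattered back afterwards.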

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
%if avx_enabled == 0
    mova %5, %2
    mova %4, %1
    psubusb %5, %1
    psubusb %4, %2
%else
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%endif
    por %4, %5
    psubusb %4, %3
%endmacro

; out: %4 = 0xff/0x00 byte mask, set where |%1-%2| <= %3 (i.e. where the
;      "greater than %3" test fails; callers rely on this polarity)
; clobbers: %5
%macro DIFF_GT2 5
%if ARCH_X86_64
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%else
    mova %5, %2
    mova %4, %1
    psubusb %5, %1
    psubusb %4, %2
%endif
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro
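
; Both helpers derive |%1-%2| from a pair of unsigned saturating subtractions
; (one of the two results is always zero). DIFF_GT leaves nonzero bytes exactly
; where the difference exceeds %3, while DIFF_GT2's final pcmpeqb turns that
; into a full 0xff/0x00 mask set where the difference does NOT exceed %3,
; which is the polarity the callers combine with their other masks.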

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd m4, %1
    movd m5, %2
    SPLATW m4, m4
    SPLATW m5, m5
    packuswb m4, m4 ; 16x alpha-1
    packuswb m5, m5 ; 16x beta-1
%if %0>2
    mova %3, m4
%endif
    DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por m7, m4
    DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por m7, m4
    pxor m6, m6
    pcmpeqb m7, m6
%endmacro
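
; Roughly, per byte lane, the mask built above is the usual H.264 edge test:
;   filter = (|p0-q0| <= alpha-1 && |p1-p0| <= beta-1 && |q1-q0| <= beta-1)
;            ? 0xff : 0x00
; with alpha/beta already decremented by the callers.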

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    pcmpeqb m4, m4
    pxor m5, m1, m2 ; p0^q0
    pxor m3, m4
    pand m5, [pb_1] ; (p0^q0)&1
    pavgb m3, m0 ; (p1 - q1 + 256)>>1
    pxor m4, m1
    pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pavgb m4, m2 ; (q0 - p0 + 256)>>1
    pavgb m3, m5
    mova m6, [pb_A1]
    paddusb m3, m4 ; d+128+33
    psubusb m6, m3
    psubusb m3, [pb_A1]
    pminub m6, m7
    pminub m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro
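
; For reference, the normal (bS < 4) H.264 p0/q0 update being computed here is:
;   delta = clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc)
;   p0'   = clip_uint8(p0 + delta),  q0' = clip_uint8(q0 - delta)
; the pavgb/paddusb sequence evaluates it entirely in the unsigned byte domain,
; with pb_A1 (0xA1 = 128+33) removing the bias noted in the "d+128+33" comment.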

; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    pavgb %6, m1, m2
    pavgb %2, %6 ; avg(p2,avg(p0,q0))
    pxor %6, %3
    pand %6, [pb_1] ; (p2^avg(p0,q0))&1
    psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
    psubusb %6, %1, %5
    paddusb %5, %1
    pmaxub %2, %6
    pminub %2, %5
    mova %4, %2
%endmacro
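
; In spec terms this is the p1/q1 update: the new value
;   (p2 + ((p0+q0+1)>>1)) >> 1
; is clamped to [p1-tc0, p1+tc0], which is equivalent to
;   p1 + clip(-tc0, tc0, (p2 + ((p0+q0+1)>>1) - 2*p1) >> 1).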

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 0
cglobal deblock_v_luma_8, 5,5,10
    movd m8, [r4] ; tc0
    lea r4, [r1*3]
    dec r2d ; alpha-1
    neg r4
    dec r3d ; beta-1
    add r4, r0 ; pix-3*stride

    mova m0, [r4+r1] ; p1
    mova m1, [r4+2*r1] ; p0
    mova m2, [r0] ; q0
    mova m3, [r0+r1] ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn m9, m7
    pand m8, m9

    movdqa m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand m6, m9
    psubb m7, m8, m6
    pand m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand m6, m9
    pand m8, m6
    psubb m7, m6
    mova m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova [r4+2*r1], m1
    mova [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
    movsxd r7, r1d
    lea r8, [r7+r7*2]
    lea r6, [r0-4]
    lea r5, [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x30 ; shadow space + r4
%else
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp
    lea r6, [r6+r7*8]
    lea r5, [r5+r7*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
    lea r0, [pix_tmp+0x30]
    mov r1d, 0x10
%if WIN64
    mov [rsp+0x20], r4
%endif
    call deblock_v_luma_8

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    add r6, 2
    add r5, 2
    movq m0, [pix_tmp+0x18]
    movq m1, [pix_tmp+0x28]
    movq m2, [pix_tmp+0x38]
    movq m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)

    shl r7, 3
    sub r6, r7
    sub r5, r7
    shr r7, 3
    movq m0, [pix_tmp+0x10]
    movq m1, [pix_tmp+0x20]
    movq m2, [pix_tmp+0x30]
    movq m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)

    RET
%endmacro
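
; The horizontal variant above reuses the vertical filter: a 16x6 strip around
; the vertical edge is transposed into pix_tmp (stride 0x10), deblock_v_luma_8
; runs on that buffer, and only the middle four rows (p1, p0, q0, q1, the only
; pixels the filter can modify) are transposed back into the frame.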

INIT_XMM sse2
DEBLOCK_LUMA
INIT_XMM avx
DEBLOCK_LUMA

%else

%macro DEBLOCK_LUMA 2
;-----------------------------------------------------------------------------
; void ff_deblock_v8_luma(uint8_t *pix, int stride, int alpha, int beta,
;                         int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_8, 5,5,8,2*%2
    lea r4, [r1*3]
    dec r2 ; alpha-1
    neg r4
    dec r3 ; beta-1
    add r4, r0 ; pix-3*stride

    mova m0, [r4+r1] ; p1
    mova m1, [r4+2*r1] ; p0
    mova m2, [r0] ; q0
    mova m3, [r0+r1] ; q1
    LOAD_MASK r2, r3

    mov r3, r4mp
    pcmpeqb m3, m3
    movd m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova [esp+%2], m4 ; tc
    pcmpgtb m4, m3
    mova m3, [r4] ; p2
    pand m4, m7
    mova [esp], m4 ; mask

    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand m6, m4
    pand m4, [esp+%2] ; tc
    psubb m7, m4, m6
    pand m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand m6, [esp] ; mask
    mova m5, [esp+%2] ; tc
    psubb m7, m6
    pand m5, m6
    mova m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova [r4+2*r1], m1
    mova [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 0,5,8,0x60+HAVE_ALIGNED_STACK*12
    mov r0, r0mp
    mov r3, r1m
    lea r4, [r3*3]
    sub r0, 4
    lea r1, [r0+r4]
%define pix_tmp esp+12*HAVE_ALIGNED_STACK

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea r0, [r0+r3*8]
    lea r1, [r1+r3*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea r0, [pix_tmp+0x30]
    PUSH dword r4m
    PUSH dword r3m
    PUSH dword r2m
    PUSH dword 16
    PUSH dword r0
    call deblock_%1_luma_8
%ifidn %1, v8
    add dword [esp ], 8 ; pix_tmp+0x38
    add dword [esp+16], 2 ; tc0+2
    call deblock_%1_luma_8
%endif
    ADD esp, 20

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    mov r0, r0mp
    sub r0, 2

    movq m0, [pix_tmp+0x10]
    movq m1, [pix_tmp+0x20]
    lea r1, [r0+r4]
    movq m2, [pix_tmp+0x30]
    movq m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)

    lea r0, [r0+r3*8]
    lea r1, [r1+r3*8]
    movq m0, [pix_tmp+0x18]
    movq m1, [pix_tmp+0x28]
    movq m2, [pix_tmp+0x38]
    movq m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)

    RET
%endmacro ; DEBLOCK_LUMA
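
; On x86-32 the mmxext (v8) build only handles 8 pixels per call, so the
; horizontal wrapper in the macro above calls deblock_v8_luma_8 twice, bumping
; the pix_tmp argument by 8 and the tc0 pointer by 2 for the second half.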

INIT_MMX mmxext
DEBLOCK_LUMA v8, 8
INIT_XMM sse2
DEBLOCK_LUMA v, 16
INIT_XMM avx
DEBLOCK_LUMA v, 16

%endif ; ARCH



%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
%if ARCH_X86_64
    pavgb t0, p2, p1
    pavgb t1, p0, q0
%else
    mova t0, p2
    mova t1, p0
    pavgb t0, p1
    pavgb t1, q0
%endif
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova t5, t1
%if ARCH_X86_64
    paddb t2, p2, p1
    paddb t3, p0, q0
%else
    mova t2, p2
    mova t3, p0
    paddb t2, p1
    paddb t3, q0
%endif
    paddb t2, t3
    mova t3, t2
    mova t4, t2
    psrlw t2, 1
    pavgb t2, mpb_0
    pxor t2, t0
    pand t2, mpb_1
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

%if ARCH_X86_64
    pavgb t1, p2, q1
    psubb t2, p2, q1
%else
    mova t1, p2
    mova t2, p2
    pavgb t1, q1
    psubb t2, q1
%endif
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand t2, mpb_1
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_0
    pxor t3, t1
    pand t3, mpb_1
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    pxor t3, p0, q1
    pavgb t2, p0, q1
    pand t3, mpb_1
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4

    pxor t1, t2
    pxor t2, p0
    pand t1, mask1p
    pand t2, mask0
    pxor t1, t2
    pxor t1, p0
    mova %1, t1 ; store p0

    mova t1, %4 ; p3
    paddb t2, t1, p2
    pavgb t1, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_0
    pxor t2, t1
    pand t2, mpb_1
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor t0, p1
    pxor t1, p2
    pand t0, mask1p
    pand t1, mask1p
    pxor t0, p1
    pxor t1, p2
    mova %2, t0 ; store p1
    mova %3, t1 ; store p2
%endmacro
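
; Taken together, and matching the per-line comments above, this evaluates the
; H.264 strong (bS = 4) filter for one side of the edge:
;   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3   where mask1p is set
;   p0' = (2*p1 + p0 + q1 + 2) >> 2                 elsewhere (mask0 only)
;   p1' = (p2 + p1 + p0 + q0 + 2) >> 2              where mask1p is set
;   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3     where mask1p is set
; The q side is produced by running the same macro again after
; LUMA_INTRA_SWAP_PQ (below) renames the p/q registers.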

%macro LUMA_INTRA_SWAP_PQ 0
%define q1 m0
%define q0 m1
%define p0 m2
%define p1 m3
%define p2 q2
%define mask1p mask1q
%endmacro

%macro DEBLOCK_LUMA_INTRA 1
%define p1 m0
%define p0 m1
%define q0 m2
%define q1 m3
%define t0 m4
%define t1 m5
%define t2 m6
%define t3 m7
%if ARCH_X86_64
%define p2 m8
%define q2 m9
%define t4 m10
%define t5 m11
%define mask0 m12
%define mask1p m13
%if WIN64
%define mask1q [rsp]
%else
%define mask1q [rsp-24]
%endif
%define mpb_0 m14
%define mpb_1 m15
%else
%define spill(x) [esp+16*x]
%define p2 [r4+r1]
%define q2 [r0+2*r1]
%define t4 spill(0)
%define t5 spill(1)
%define mask0 spill(2)
%define mask1p spill(3)
%define mask1q spill(4)
%define mpb_0 [pb_0]
%define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
%if WIN64
cglobal deblock_%1_luma_intra_8, 4,6,16,0x10
%else
cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
%endif
    lea r4, [r1*4]
    lea r5, [r1*3] ; 3*stride
    dec r2d ; alpha-1
    jl .end
    neg r4
    dec r3d ; beta-1
    jl .end
    add r4, r0 ; pix-4*stride
    mova p1, [r4+2*r1]
    mova p0, [r4+r5]
    mova q0, [r0]
    mova q1, [r0+r1]
%if ARCH_X86_64
    pxor mpb_0, mpb_0
    mova mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP 7, 12 ; m12=mask0
    pavgb t5, mpb_0
    pavgb t5, mpb_1 ; alpha/4+1
    movdqa p2, [r4+r1]
    movdqa q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand t0, mask0
    pand t4, t0
    pand t2, t0
    mova mask1q, t4
    mova mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova m4, t5
    mova mask0, m7
    pavgb m4, [pb_0]
    pavgb m4, [pb_1] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand m4, m6
    mova mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand m4, m6
    mova mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
    RET

INIT_MMX cpuname
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_8, 4,9,0,0x80
    movsxd r7, r1d
    lea r8, [r7*3]
    lea r6, [r0-4]
    lea r5, [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x20 ; shadow space
%else
    %define pix_tmp rsp
%endif

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea r6, [r6+r7*8]
    lea r5, [r5+r7*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea r0, [pix_tmp+0x40]
    mov r1, 0x10
    call deblock_v_luma_intra_8

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea r5, [r6+r8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    shl r7, 3
    sub r6, r7
    sub r5, r7
    shr r7, 3
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    RET
%else
cglobal deblock_h_luma_intra_8, 2,4,8,0x80
    lea r3, [r1*3]
    sub r0, 4
    lea r2, [r0+r3]
%define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea r0, [r0+r1*8]
    lea r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea r0, [pix_tmp+0x40]
    PUSH dword r3m
    PUSH dword r2m
    PUSH dword 16
    PUSH r0
    call deblock_%1_luma_intra_8
%ifidn %1, v8
    add dword [rsp], 8 ; pix_tmp+8
    call deblock_%1_luma_intra_8
%endif
    ADD esp, 16

    mov r1, r1m
    mov r0, r0mp
    lea r3, [r1*3]
    sub r0, 4
    lea r2, [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea r0, [r0+r1*8]
    lea r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA_INTRA v8
%endif

INIT_MMX mmxext

%macro CHROMA_V_START 0
    dec r2d ; alpha-1
    dec r3d ; beta-1
    mov t5, r0
    sub t5, r1
    sub t5, r1
%endmacro

%macro CHROMA_H_START 0
    dec r2d
    dec r3d
    sub r0, 2
    lea t6, [r1*3]
    mov t5, r0
    add r0, t6
%endmacro

%define t5 r5
%define t6 r6

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma(uint8_t *pix, int stride, int alpha, int beta,
;                          int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_8, 5,6
    CHROMA_V_START
    movq m0, [t5]
    movq m1, [t5+r1]
    movq m2, [r0]
    movq m3, [r0+r1]
    call ff_chroma_inter_body_mmxext
    movq [t5+r1], m1
    movq [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma(uint8_t *pix, int stride, int alpha, int beta,
;                          int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_8, 5,7
%if ARCH_X86_64
    ; This could use the red zone on 64 bit unix to avoid the stack pointer
    ; readjustment, but valgrind assumes the red zone is clobbered on
    ; function calls and returns.
    sub rsp, 16
    %define buf0 [rsp]
    %define buf1 [rsp+8]
%else
    %define buf0 r0m
    %define buf1 r2m
%endif
    CHROMA_H_START
    TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    movq buf0, m0
    movq buf1, m3
    call ff_chroma_inter_body_mmxext
    movq m0, buf0
    movq m3, buf1
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
%if ARCH_X86_64
    add rsp, 16
%endif
    RET

ALIGN 16
ff_chroma_inter_body_mmxext:
    LOAD_MASK r2d, r3d
    movd m6, [r4] ; tc0
    punpcklbw m6, m6
    pand m7, m6
    DEBLOCK_P0_Q0
    ret
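
; Chroma inter filtering only ever changes p0/q0 (there is no p1/q1 update),
; so this shared body just builds the mask, duplicates each tc0[i] so it covers
; the two chroma pixels that share it, and runs DEBLOCK_P0_Q0.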



; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
    movq m4, %1
    pxor m4, %3
    pand m4, [pb_1] ; m4 = (p0^q1)&1
    pavgb %1, %3
    psubusb %1, m4
    pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro
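
; pavgb rounds upward, so averaging twice in a row would over-round; the
; (p0^q1)&1 correction subtracted before the second pavgb makes the result
; exactly (p0 + q1 + 2*p1 + 2) >> 2, as stated in the macro header.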

%define t5 r4
%define t6 r5

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_8, 4,5
    CHROMA_V_START
    movq m0, [t5]
    movq m1, [t5+r1]
    movq m2, [r0]
    movq m3, [r0+r1]
    call ff_chroma_intra_body_mmxext
    movq [t5+r1], m1
    movq [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_8, 4,6
    CHROMA_H_START
    TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    call ff_chroma_intra_body_mmxext
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

ALIGN 16
ff_chroma_intra_body_mmxext:
    LOAD_MASK r2d, r3d
    movq m5, m1
    movq m6, m2
    CHROMA_INTRA_P0 m1, m0, m3
    CHROMA_INTRA_P0 m2, m3, m0
    psubb m1, m5
    psubb m2, m6
    pand m1, m7
    pand m2, m7
    paddb m1, m5
    paddb m2, m6
    ret

;-----------------------------------------------------------------------------
; void ff_h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
;                                   int8_t ref[2][40], int16_t mv[2][40][2],
;                                   int bidir, int edges, int step,
;                                   int mask_mv0, int mask_mv1, int field);
;
; bidir    is 0 or 1
; edges    is 1 or 4
; step     is 1 or 2
; mask_mv0 is 0 or 3
; mask_mv1 is 0 or 1
; field    is 0 or 1
;-----------------------------------------------------------------------------
%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
                                        ; dir, d_idx, mask_dir, bidir
%define edgesd %1
%define stepd %2
%define mask_mvd %3
%define dir %4
%define d_idx %5
%define mask_dir %6
%define bidir %7
    xor b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step)
%%.b_idx_loop:
%if mask_dir == 0
    pxor m0, m0
%endif
    test b_idxd, dword mask_mvd
    jnz %%.skip_loop_iter ; if (!(b_idx & mask_mv))
%if bidir == 1
    movd m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] }
    punpckldq m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] }
    pshufw m0, [refq+b_idxq+12], 0x44 ; { ref0[b], ref0[b] }
    pshufw m1, [refq+b_idxq+52], 0x44 ; { ref1[b], ref1[b] }
    pshufw m3, m2, 0x4E ; { ref1[bn], ref0[bn] }
    psubb m0, m2 ; { ref0[b] != ref0[bn],
                 ;   ref0[b] != ref1[bn] }
    psubb m1, m3 ; { ref1[b] != ref1[bn],
                 ;   ref1[b] != ref0[bn] }

    por m0, m1
    mova m1, [mvq+b_idxq*4+(d_idx+12)*4]
    mova m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    mova m3, m1
    mova m4, m2
    psubw m1, [mvq+b_idxq*4+12*4]
    psubw m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw m3, [mvq+b_idxq*4+52*4]
    psubw m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb m1, m2
    packsswb m3, m4
    paddb m1, m6
    paddb m3, m6
    psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb m3, m5
    packsswb m1, m3

    por m0, m1
    mova m1, [mvq+b_idxq*4+(d_idx+52)*4]
    mova m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
    mova m3, m1
    mova m4, m2
    psubw m1, [mvq+b_idxq*4+12*4]
    psubw m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw m3, [mvq+b_idxq*4+52*4]
    psubw m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb m1, m2
    packsswb m3, m4
    paddb m1, m6
    paddb m3, m6
    psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb m3, m5
    packsswb m1, m3

    pshufw m1, m1, 0x4E
    por m0, m1
    pshufw m1, m0, 0x4E
    pminub m0, m1
%else ; bidir == 0
    movd m0, [refq+b_idxq+12]
    psubb m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn]

    mova m1, [mvq+b_idxq*4+12*4]
    mova m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw m1, [mvq+b_idxq*4+(d_idx+12)*4]
    psubw m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    packsswb m1, m2
    paddb m1, m6
    psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    packsswb m1, m1
    por m0, m1
%endif ; bidir == 1/0

%%.skip_loop_iter:
    movd m1, [nnzq+b_idxq+12]
    por m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn]

    pminub m1, m7
    pminub m0, m7
    psllw m1, 1
    pxor m2, m2
    pmaxub m1, m0
    punpcklbw m1, m2
    movq [bsq+b_idxq+32*dir], m1

    add b_idxd, dword stepd
    cmp b_idxd, dword edgesd
    jl %%.b_idx_loop
%endmacro
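
; For each pair of neighbouring 4x4 blocks (b, bn) the iteration stores the
; boundary strength roughly as:
;   bs = 2  if nnz[b] || nnz[bn]
;   bs = 1  else if the references differ or any mv component differs by
;           4 or more quarter-pel units (2 vertically in field mode, hence
;           pb_3 / pb_3_1 and its doubled copy used as bias and limit)
;   bs = 0  otherwise
; widened to 16-bit and written to bs[dir][b_idx].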

INIT_MMX mmxext
cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \
                                            step, mask_mv0, mask_mv1, field
%define b_idxq bidirq
%define b_idxd bidird
    cmp dword fieldm, 0
    mova m7, [pb_1]
    mova m5, [pb_3]
    je .nofield
    mova m5, [pb_3_1]
.nofield:
    mova m6, m5
    paddb m5, m5

    shl dword stepd, 3
    shl dword edgesd, 3
%if ARCH_X86_32
%define mask_mv0d mask_mv0m
%define mask_mv1d mask_mv1m
%endif
    shl dword mask_mv1d, 3
    shl dword mask_mv0d, 3

    cmp dword bidird, 0
    jne .bidir
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 0
    loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 0

    mova m0, [bsq+mmsize*0]
    mova m1, [bsq+mmsize*1]
    mova m2, [bsq+mmsize*2]
    mova m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova [bsq+mmsize*0], m0
    mova [bsq+mmsize*1], m1
    mova [bsq+mmsize*2], m2
    mova [bsq+mmsize*3], m3
    RET

.bidir:
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 1
    loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 1

    mova m0, [bsq+mmsize*0]
    mova m1, [bsq+mmsize*1]
    mova m2, [bsq+mmsize*2]
    mova m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova [bsq+mmsize*0], m0
    mova [bsq+mmsize*1], m1
    mova [bsq+mmsize*2], m2
    mova [bsq+mmsize*3], m3
    RET