relicense h264 deblock sse2 to lgpl
libavcodec/x86/h264_deblock_sse2.asm
;*****************************************************************************
;* MMX/SSE2-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"

SECTION_RODATA
pb_00: times 16 db 0x00
pb_01: times 16 db 0x01
pb_03: times 16 db 0x03
pb_a1: times 16 db 0xa1

SECTION .text

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

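; e.g. PASS8ROWS(p, p+3*s, s, 3*s) expands to the eight row addresses
; [p], [p+s], [p+2*s], [p+3*s], [p+4*s], [p+5*s], [p+6*s], [p+7*s]
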
; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 8
    movd m0, %1
    movd m2, %2
    movd m1, %3
    movd m3, %4
    punpcklbw m0, m2
    punpcklbw m1, m3
    movq m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1

    movd m4, %5
    movd m6, %6
    movd m5, %7
    movd m7, %8
    punpcklbw m4, m6
    punpcklbw m5, m7
    movq m6, m4
    punpcklwd m4, m5
    punpckhwd m6, m5

    movq m1, m0
    movq m3, m2
    punpckldq m0, m4
    punpckhdq m1, m4
    punpckldq m2, m6
    punpckhdq m3, m6
%endmacro
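
; The transpose works purely by interleaving: punpcklbw pairs bytes of two
; rows, punpcklwd/punpckhwd then pair the resulting 16-bit groups, and
; punpckldq/punpckhdq the 32-bit groups, so three passes exchange the rows
; and columns of the block.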

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4_STORE 8
    movq m4, m0
    movq m5, m1
    movq m6, m2
    punpckhdq m4, m4
    punpckhdq m5, m5
    punpckhdq m6, m6

    punpcklbw m0, m1
    punpcklbw m2, m3
    movq m1, m0
    punpcklwd m0, m2
    punpckhwd m1, m2
    movd %1, m0
    punpckhdq m0, m0
    movd %2, m0
    movd %3, m1
    punpckhdq m1, m1
    movd %4, m1

    punpckhdq m3, m3
    punpcklbw m4, m5
    punpcklbw m6, m3
    movq m5, m4
    punpcklwd m4, m6
    punpckhwd m5, m6
    movd %5, m4
    punpckhdq m4, m4
    movd %6, m4
    movd %7, m5
    punpckhdq m5, m5
    movd %8, m5
%endmacro

%macro SBUTTERFLY 4
    movq %4, %2
    punpckl%1 %2, %3
    punpckh%1 %4, %3
%endmacro
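
; SBUTTERFLY %1=unit (bw/wd/dq), %2=in/out low half, %3=second input,
; %4=output high half: a single interleave step, the building block of the
; larger transposes below.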

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    movq m0, %1
    movq m1, %2
    movq m2, %3
    movq m3, %4
    movq m4, %5
    movq m5, %6
    movq m6, %7
    SBUTTERFLY bw, m0, m1, m7
    SBUTTERFLY bw, m2, m3, m1
    SBUTTERFLY bw, m4, m5, m3
    movq [%9+0x10], m1
    SBUTTERFLY bw, m6, %8, m5
    SBUTTERFLY wd, m0, m2, m1
    SBUTTERFLY wd, m4, m6, m2
    punpckhdq m0, m4
    movq [%9+0x00], m0
    SBUTTERFLY wd, m7, [%9+0x10], m6
    SBUTTERFLY wd, m3, m5, m4
    SBUTTERFLY dq, m7, m3, m0
    SBUTTERFLY dq, m1, m2, m5
    punpckldq m6, m4
    movq [%9+0x10], m1
    movq [%9+0x20], m5
    movq [%9+0x30], m7
    movq [%9+0x40], m0
    movq [%9+0x50], m6
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    movq m0, %1
    movq m1, %2
    movq m2, %3
    movq m3, %4
    movq m4, %5
    movq m5, %6
    movq m6, %7
    SBUTTERFLY bw, m0, m1, m7
    SBUTTERFLY bw, m2, m3, m1
    SBUTTERFLY bw, m4, m5, m3
    SBUTTERFLY bw, m6, %8, m5
    movq %9, m3
    SBUTTERFLY wd, m0, m2, m3
    SBUTTERFLY wd, m4, m6, m2
    SBUTTERFLY wd, m7, m1, m6
    movq %11, m2
    movq m2, %9
    SBUTTERFLY wd, m2, m5, m1
    SBUTTERFLY dq, m0, m4, m5
    SBUTTERFLY dq, m7, m2, m4
    movq %9, m0
    movq %10, m5
    movq %13, m7
    movq %14, m4
    SBUTTERFLY dq, m3, %11, m0
    SBUTTERFLY dq, m6, m1, m5
    movq %11, m3
    movq %12, m0
    movq %15, m6
    movq %16, m5
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
    mova %5, %2
    mova %4, %1
    psubusb %5, %1
    psubusb %4, %2
    por %4, %5
    psubusb %4, %3
%endmacro
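
; psubusb saturates at zero, so the por combines max(%1-%2,0) and max(%2-%1,0)
; into |%1-%2|; the final psubusb leaves a nonzero byte exactly where
; |%1-%2| > %3. LOAD_MASK later converts this into a 0xff/0x00 mask by
; pcmpeqb against zero.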

; out: %4 = 0xff where |%1-%2| <= %3 (i.e. NOT |%1-%2| > %3)
; clobbers: %5
%macro DIFF_GT2 5
    mova %5, %2
    mova %4, %1
    psubusb %5, %1
    psubusb %4, %2
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro
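
; Unlike DIFF_GT, both one-sided differences are reduced by %3 before the
; compare: they are equal (both zero) exactly when |%1-%2| <= %3, so pcmpeqb
; yields 0xff for the "difference within threshold" cases used by the tc0
; increments and the intra strong-filter decisions.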

%macro SPLATW 1
%ifidn m0, xmm0
    pshuflw %1, %1, 0
    punpcklqdq %1, %1
%else
    pshufw %1, %1, 0
%endif
%endmacro
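
; SSE2 has no pshufw on xmm registers, so the low word is replicated with
; pshuflw and then copied to the high quadword with punpcklqdq; the MMXEXT
; path only needs a single pshufw.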

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd m4, %1
    movd m5, %2
    SPLATW m4
    SPLATW m5
    packuswb m4, m4 ; 16x alpha-1
    packuswb m5, m5 ; 16x beta-1
%if %0>2
    mova %3, m4
%endif
    DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por m7, m4
    DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por m7, m4
    pxor m6, m6
    pcmpeqb m7, m6
%endmacro
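
; m7 ends up 0xff only where all three DIFF_GT results are zero, i.e. where
; |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta -- the basic per-pixel
; condition for the H.264 luma deblocking filter.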

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    mova m5, m1
    pxor m5, m2 ; p0^q0
    pand m5, [pb_01] ; (p0^q0)&1
    pcmpeqb m4, m4
    pxor m3, m4
    pavgb m3, m0 ; (p1 - q1 + 256)>>1
    pavgb m3, [pb_03] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pxor m4, m1
    pavgb m4, m2 ; (q0 - p0 + 256)>>1
    pavgb m3, m5
    paddusb m3, m4 ; d+128+33
    mova m6, [pb_a1]
    psubusb m6, m3
    psubusb m3, [pb_a1]
    pminub m6, m7
    pminub m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro
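
; bS<4 filtering of p0/q0 kept entirely in unsigned bytes: the pavgb chain
; accumulates the (p1-q1) and (q0-p0) terms with a bias (the "d+128+33"
; above), the two psubusb against pb_a1 split the biased value into the
; positive and negative parts of the correction, pminub with m7 (tc & mask)
; clips its magnitude to tc and zeroes it where the mask is off, and the
; final psubusb/paddusb pairs apply p0+delta and q0-delta with saturation.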

; in: m1=p0 m2=q0
; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    mova %6, m1
    pavgb %6, m2
    pavgb %2, %6 ; avg(p2,avg(p0,q0))
    pxor %6, %3
    pand %6, [pb_01] ; (p2^avg(p0,q0))&1
    psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
    mova %6, %1
    psubusb %6, %5
    paddusb %5, %1
    pmaxub %2, %6
    pminub %2, %5
    mova %4, %2
%endmacro
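
; pavgb rounds up; xoring p2 with avg(p0,q0) and keeping the low bit recovers
; the dropped carry, so the psubusb turns the rounded average into the
; truncating (p2+((p0+q0+1)>>1))>>1 above. pmaxub/pminub against the
; saturated q1-tc0 and q1+tc0 then implement the clip.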

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_XMM
cglobal x264_deblock_v_luma_sse2, 5,5,10
    movd m8, [r4] ; tc0
    lea r4, [r1*3]
    dec r2d ; alpha-1
    neg r4
    dec r3d ; beta-1
    add r4, r0 ; pix-3*stride

    mova m0, [r4+r1] ; p1
    mova m1, [r4+2*r1] ; p0
    mova m2, [r0] ; q0
    mova m3, [r0+r1] ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn m9, m7
    pand m8, m9

    movdqa m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand m6, m9
    mova m7, m8
    psubb m7, m6
    pand m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand m6, m9
    pand m8, m6
    psubb m7, m6
    mova m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova [r4+2*r1], m1
    mova [r0], m2
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_sse2, 5,7
    movsxd r10, r1d
    lea r11, [r10+r10*2]
    lea r6, [r0-4]
    lea r5, [r0-4+r11]
%ifdef WIN64
    sub rsp, 0x98
    %define pix_tmp rsp+0x30
%else
    sub rsp, 0x68
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
    lea r6, [r6+r10*8]
    lea r5, [r5+r10*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
    lea r0, [pix_tmp+0x30]
    mov r1d, 0x10
%ifdef WIN64
    mov [rsp+0x20], r4
%endif
    call x264_deblock_v_luma_sse2

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    add r6, 2
    add r5, 2
    movq m0, [pix_tmp+0x18]
    movq m1, [pix_tmp+0x28]
    movq m2, [pix_tmp+0x38]
    movq m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)

    shl r10, 3
    sub r6, r10
    sub r5, r10
    shr r10, 3
    movq m0, [pix_tmp+0x10]
    movq m1, [pix_tmp+0x20]
    movq m2, [pix_tmp+0x30]
    movq m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)

%ifdef WIN64
    add rsp, 0x98
%else
    add rsp, 0x68
%endif
    RET

%else

%macro DEBLOCK_LUMA 3
;-----------------------------------------------------------------------------
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_%1, 5,5
    lea r4, [r1*3]
    dec r2 ; alpha-1
    neg r4
    dec r3 ; beta-1
    add r4, r0 ; pix-3*stride
%assign pad 2*%3+12-(stack_offset&15)
    SUB esp, pad

    mova m0, [r4+r1] ; p1
    mova m1, [r4+2*r1] ; p0
    mova m2, [r0] ; q0
    mova m3, [r0+r1] ; q1
    LOAD_MASK r2, r3

    mov r3, r4mp
    movd m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova [esp+%3], m4 ; tc
    pcmpeqb m3, m3
    pcmpgtb m4, m3
    pand m4, m7
    mova [esp], m4 ; mask

    mova m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand m6, m4
    pand m4, [esp+%3] ; tc
    mova m7, m4
    psubb m7, m6
    pand m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    mova m5, [esp] ; mask
    pand m6, m5
    mova m5, [esp+%3] ; tc
    pand m5, m6
    psubb m7, m6
    mova m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova [r4+2*r1], m1
    mova [r0], m2
    ADD esp, pad
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_%1, 0,5
    mov r0, r0mp
    mov r3, r1m
    lea r4, [r3*3]
    sub r0, 4
    lea r1, [r0+r4]
%assign pad 0x78-(stack_offset&15)
    SUB esp, pad
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea r0, [r0+r3*8]
    lea r1, [r1+r3*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea r0, [pix_tmp+0x30]
    PUSH dword r4m
    PUSH dword r3m
    PUSH dword r2m
    PUSH dword 16
    PUSH dword r0
    call x264_deblock_%2_luma_%1
%ifidn %2, v8
    add dword [esp], 8 ; pix_tmp+0x38
    add dword [esp+16], 2 ; tc0+2
    call x264_deblock_%2_luma_%1
%endif
    ADD esp, 20

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    mov r0, r0mp
    sub r0, 2
    lea r1, [r0+r4]

    movq m0, [pix_tmp+0x10]
    movq m1, [pix_tmp+0x20]
    movq m2, [pix_tmp+0x30]
    movq m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)

    lea r0, [r0+r3*8]
    lea r1, [r1+r3*8]
    movq m0, [pix_tmp+0x18]
    movq m1, [pix_tmp+0x28]
    movq m2, [pix_tmp+0x38]
    movq m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)

    ADD esp, pad
    RET
%endmacro ; DEBLOCK_LUMA

INIT_XMM
DEBLOCK_LUMA sse2, v, 16

%endif ; ARCH



%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
    mova t0, p2
    mova t1, p0
    pavgb t0, p1
    pavgb t1, q0
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova t5, t1
    mova t2, p2
    mova t3, p0
    paddb t2, p1
    paddb t3, q0
    paddb t2, t3
    mova t3, t2
    mova t4, t2
    psrlw t2, 1
    pavgb t2, mpb_00
    pxor t2, t0
    pand t2, mpb_01
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

    mova t1, p2
    mova t2, p2
    pavgb t1, q1
    psubb t2, q1
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand t2, mpb_01
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_00
    pxor t3, t1
    pand t3, mpb_01
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    mova t3, p0
    mova t2, p0
    pxor t3, q1
    pavgb t2, q1
    pand t3, mpb_01
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4

    pxor t1, t2
    pxor t2, p0
    pand t1, mask1p
    pand t2, mask0
    pxor t1, t2
    pxor t1, p0
    mova %1, t1 ; store p0

    mova t1, %4 ; p3
    mova t2, t1
    pavgb t1, p2
    paddb t2, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_00
    pxor t2, t1
    pand t2, mpb_01
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor t0, p1
    pxor t1, p2
    pand t0, mask1p
    pand t1, mask1p
    pxor t0, p1
    pxor t1, p2
    mova %2, t0 ; store p1
    mova %3, t1 ; store p2
%endmacro
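
; Strong (bS=4) filtering of the p side. The pavgb chains produce rounded
; versions of the (...+2)>>2 and (...+4)>>3 sums noted in the comments; the
; parallel paddb/psrlw path recovers enough low-order information about the
; exact sums for the pxor/pand mpb_01/psubb steps to cancel pavgb's upward
; rounding. mask1p (strong-filter condition) and mask0 (basic filter
; condition) then blend p0'a, p0'b and the unfiltered p0 into the stored
; result.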

%macro LUMA_INTRA_SWAP_PQ 0
%define q1 m0
%define q0 m1
%define p0 m2
%define p1 m3
%define p2 q2
%define mask1p mask1q
%endmacro
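
; Redefining the p*/q* and mask names lets the same LUMA_INTRA_P012 body be
; reused verbatim for the q side of the edge.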

%macro DEBLOCK_LUMA_INTRA 2
%define p1 m0
%define p0 m1
%define q0 m2
%define q1 m3
%define t0 m4
%define t1 m5
%define t2 m6
%define t3 m7
%ifdef ARCH_X86_64
%define p2 m8
%define q2 m9
%define t4 m10
%define t5 m11
%define mask0 m12
%define mask1p m13
%define mask1q [rsp-24]
%define mpb_00 m14
%define mpb_01 m15
%else
%define spill(x) [esp+16*x+((stack_offset+4)&15)]
%define p2 [r4+r1]
%define q2 [r0+2*r1]
%define t4 spill(0)
%define t5 spill(1)
%define mask0 spill(2)
%define mask1p spill(3)
%define mask1q spill(4)
%define mpb_00 [pb_00]
%define mpb_01 [pb_01]
%endif

;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
%ifndef ARCH_X86_64
    sub esp, 0x60
%endif
    lea r4, [r1*4]
    lea r5, [r1*3] ; 3*stride
    dec r2d ; alpha-1
    jl .end
    neg r4
    dec r3d ; beta-1
    jl .end
    add r4, r0 ; pix-4*stride
    mova p1, [r4+2*r1]
    mova p0, [r4+r5]
    mova q0, [r0]
    mova q1, [r0+r1]
%ifdef ARCH_X86_64
    pxor mpb_00, mpb_00
    mova mpb_01, [pb_01]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP 7, 12 ; m12=mask0
    pavgb t5, mpb_00
    pavgb t5, mpb_01 ; alpha/4+1
    movdqa p2, [r4+r1]
    movdqa q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand t0, mask0
    pand t4, t0
    pand t2, t0
    mova mask1q, t4
    mova mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova m4, t5
    mova mask0, m7
    pavgb m4, [pb_00]
    pavgb m4, [pb_01] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand m4, m6
    mova mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand m4, m6
    mova mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
%ifndef ARCH_X86_64
    add esp, 0x60
%endif
    RET

INIT_MMX
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_luma_intra_%1, 4,7
    movsxd r10, r1d
    lea r11, [r10*3]
    lea r6, [r0-4]
    lea r5, [r0-4+r11]
    sub rsp, 0x88
%define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea r6, [r6+r10*8]
    lea r5, [r5+r10*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea r0, [pix_tmp+0x40]
    mov r1, 0x10
    call x264_deblock_v_luma_intra_%1

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea r5, [r6+r11]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    shl r10, 3
    sub r6, r10
    sub r5, r10
    shr r10, 3
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    add rsp, 0x88
    RET
%else
cglobal x264_deblock_h_luma_intra_%1, 2,4
    lea r3, [r1*3]
    sub r0, 4
    lea r2, [r0+r3]
%assign pad 0x8c-(stack_offset&15)
    SUB rsp, pad
%define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea r0, [r0+r1*8]
    lea r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea r0, [pix_tmp+0x40]
    PUSH dword r3m
    PUSH dword r2m
    PUSH dword 16
    PUSH r0
    call x264_deblock_%2_luma_intra_%1
%ifidn %2, v8
    add dword [rsp], 8 ; pix_tmp+8
    call x264_deblock_%2_luma_intra_%1
%endif
    ADD esp, 16

    mov r1, r1m
    mov r0, r0mp
    lea r3, [r1*3]
    sub r0, 4
    lea r2, [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea r0, [r0+r1*8]
    lea r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    ADD rsp, pad
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM
DEBLOCK_LUMA_INTRA sse2, v
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA_INTRA mmxext, v8
%endif