x86: deduplicate some constants
[libav.git] / libavcodec / x86 / hevc_deblock.asm
1 ;*****************************************************************************
2 ;* SSE2-optimized HEVC deblocking code
3 ;*****************************************************************************
4 ;* Copyright (C) 2013 VTT
5 ;*
6 ;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
7 ;*
8 ;* This file is part of Libav.
9 ;*
10 ;* Libav is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
14 ;*
15 ;* Libav is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
19 ;*
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with Libav; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
24
25 %include "libavutil/x86/x86util.asm"
26
27 SECTION_RODATA
28
29 pw_pixel_max: times 8 dw ((1 << 10)-1)
30 pw_m2: times 8 dw -2
31 pd_1:           times 4 dd 1
32
33 cextern pw_4
34 cextern pw_8
35 cextern pw_m1
36
37 SECTION .text
38 INIT_XMM sse2
39
40 ; expands to [base], [base+stride], ..., [base+7*stride] (expects base3 == base + 3*stride, stride3 == 3*stride)
41 %define PASS8ROWS(base, base3, stride, stride3) \
42 [base], [base+stride], [base+stride*2], [base3], \
43 [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
44
45 ; in: 8 rows of 4 bytes in %1..%8
46 ; out: 4 rows of 8 words in m0..m3
47 %macro TRANSPOSE4x8B_LOAD 8
48 movd m0, %1
49 movd m2, %2
50 movd m1, %3
51 movd m3, %4
52
53 punpcklbw m0, m2
54 punpcklbw m1, m3
55 punpcklwd m0, m1
56
57 movd m4, %5
58 movd m6, %6
59 movd m5, %7
60 movd m7, %8
61
62 punpcklbw m4, m6
63 punpcklbw m5, m7
64 punpcklwd m4, m5
65
66 punpckhdq m2, m0, m4
67 punpckldq m0, m4
68
69 pxor m5, m5
70 punpckhbw m1, m0, m5
71 punpcklbw m0, m5
72 punpckhbw m3, m2, m5
73 punpcklbw m2, m5
74 %endmacro
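; The transpose macros below all follow the same pattern: rows of pixels in
; memory become one register per p/q sample position, and the matching store
; macro undoes it. A scalar C sketch of what TRANSPOSE4x8B_LOAD produces
; (illustrative only; the function and argument names are made up and are not
; part of libav):
;
;     #include <stddef.h>
;     #include <stdint.h>
;
;     static void transpose4x8b_load(const uint8_t *src, ptrdiff_t stride,
;                                    uint16_t dst[4][8])
;     {
;         for (int row = 0; row < 8; row++)      /* 8 source rows of 4 bytes */
;             for (int col = 0; col < 4; col++)  /* -> 4 rows of 8 words     */
;                 dst[col][row] = src[row * stride + col];
;     }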
75
76 ; in: 4 rows of 8 words in m0..m3
77 ; out: 8 rows of 4 bytes in %1..%8
78 %macro TRANSPOSE8x4B_STORE 8
79 packuswb m0, m0
80 packuswb m1, m1
81 packuswb m2, m2
82 packuswb m3, m3
83
84 punpcklbw m0, m1
85 punpcklbw m2, m3
86
87 punpckhwd m6, m0, m2
88 punpcklwd m0, m2
89
90 movd %1, m0
91 pshufd m0, m0, 0x39
92 movd %2, m0
93 pshufd m0, m0, 0x39
94 movd %3, m0
95 pshufd m0, m0, 0x39
96 movd %4, m0
97
98 movd %5, m6
99 pshufd m6, m6, 0x39
100 movd %6, m6
101 pshufd m6, m6, 0x39
102 movd %7, m6
103 pshufd m6, m6, 0x39
104 movd %8, m6
105 %endmacro
106
107 ; in: 8 rows of 4 words in %1..%8
108 ; out: 4 rows of 8 words in m0..m3
109 %macro TRANSPOSE4x8W_LOAD 8
110 movq m0, %1
111 movq m2, %2
112 movq m1, %3
113 movq m3, %4
114
115 punpcklwd m0, m2
116 punpcklwd m1, m3
117 punpckhdq m2, m0, m1
118 punpckldq m0, m1
119
120 movq m4, %5
121 movq m6, %6
122 movq m5, %7
123 movq m7, %8
124
125 punpcklwd m4, m6
126 punpcklwd m5, m7
127 punpckhdq m6, m4, m5
128 punpckldq m4, m5
129
130 punpckhqdq m1, m0, m4
131 punpcklqdq m0, m4
132 punpckhqdq m3, m2, m6
133 punpcklqdq m2, m6
134
135 %endmacro
136
137 ; in: 4 rows of 8 words in m0..m3
138 ; out: 8 rows of 4 words in %1..%8
139 %macro TRANSPOSE8x4W_STORE 8
140 pxor m5, m5; zeros reg
141 CLIPW m0, m5, [pw_pixel_max]
142 CLIPW m1, m5, [pw_pixel_max]
143 CLIPW m2, m5, [pw_pixel_max]
144 CLIPW m3, m5, [pw_pixel_max]
145
146 punpckhwd m4, m0, m1
147 punpcklwd m0, m1
148 punpckhwd m5, m2, m3
149 punpcklwd m2, m3
150 punpckhdq m6, m0, m2
151 punpckldq m0, m2
152
153 movq %1, m0
154 movhps %2, m0
155 movq %3, m6
156 movhps %4, m6
157
158 punpckhdq m6, m4, m5
159 punpckldq m4, m5
160
161 movq %5, m4
162 movhps %6, m4
163 movq %7, m6
164 movhps %8, m6
165 %endmacro
166
167 ; in: 8 rows of 8 bytes in %1..%8
168 ; out: 8 rows of 8 words in m0..m7
169 %macro TRANSPOSE8x8B_LOAD 8
170 movq m7, %1
171 movq m2, %2
172 movq m1, %3
173 movq m3, %4
174
175 punpcklbw m7, m2
176 punpcklbw m1, m3
177 punpcklwd m3, m7, m1
178 punpckhwd m7, m1
179
180 movq m4, %5
181 movq m6, %6
182 movq m5, %7
183 movq m15, %8
184
185 punpcklbw m4, m6
186 punpcklbw m5, m15
187 punpcklwd m9, m4, m5
188 punpckhwd m4, m5
189
190 punpckldq m1, m3, m9; 0, 1
191 punpckhdq m3, m9; 2, 3
192
193 punpckldq m5, m7, m4; 4, 5
194 punpckhdq m7, m4; 6, 7
195
196 pxor m13, m13
197
198 punpcklbw m0, m1, m13; 0 in 16 bit
199 punpckhbw m1, m13; 1 in 16 bit
200
201 punpcklbw m2, m3, m13; 2
202 punpckhbw m3, m13; 3
203
204 punpcklbw m4, m5, m13; 4
205 punpckhbw m5, m13; 5
206
207 punpcklbw m6, m7, m13; 6
208 punpckhbw m7, m13; 7
209 %endmacro
210
211
212 ; in: 8 rows of 8 words in m0..m7
213 ; out: 8 rows of 8 bytes in %1..%8
214 %macro TRANSPOSE8x8B_STORE 8
215 packuswb m0, m0
216 packuswb m1, m1
217 packuswb m2, m2
218 packuswb m3, m3
219 packuswb m4, m4
220 packuswb m5, m5
221 packuswb m6, m6
222 packuswb m7, m7
223
224 punpcklbw m0, m1
225 punpcklbw m2, m3
226
227 punpckhwd m8, m0, m2
228 punpcklwd m0, m2
229
230 punpcklbw m4, m5
231 punpcklbw m6, m7
232
233 punpckhwd m9, m4, m6
234 punpcklwd m4, m6
235
236 punpckhdq m10, m0, m4; 2, 3
237 punpckldq m0, m4; 0, 1
238
239 punpckldq m11, m8, m9; 4, 5
240 punpckhdq m8, m9; 6, 7
241 movq %1, m0
242 movhps %2, m0
243 movq %3, m10
244 movhps %4, m10
245 movq %5, m11
246 movhps %6, m11
247 movq %7, m8
248 movhps %8, m8
249 %endmacro
250
251 ; in: 8 rows of 8 words in %1..%8
252 ; out: 8 rows of 8 words in m0..m7
253 %macro TRANSPOSE8x8W_LOAD 8
254 movdqu m0, %1
255 movdqu m1, %2
256 movdqu m2, %3
257 movdqu m3, %4
258 movdqu m4, %5
259 movdqu m5, %6
260 movdqu m6, %7
261 movdqu m7, %8
262 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
263 %endmacro
264
265 ; in: 8 rows of 8 words in m0..m7
266 ; out: 8 rows of 8 words in %1..%8
267 %macro TRANSPOSE8x8W_STORE 8
268 TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
269
270 pxor m8, m8
271 CLIPW m0, m8, [pw_pixel_max]
272 CLIPW m1, m8, [pw_pixel_max]
273 CLIPW m2, m8, [pw_pixel_max]
274 CLIPW m3, m8, [pw_pixel_max]
275 CLIPW m4, m8, [pw_pixel_max]
276 CLIPW m5, m8, [pw_pixel_max]
277 CLIPW m6, m8, [pw_pixel_max]
278 CLIPW m7, m8, [pw_pixel_max]
279
280 movdqu %1, m0
281 movdqu %2, m1
282 movdqu %3, m2
283 movdqu %4, m3
284 movdqu %5, m4
285 movdqu %6, m5
286 movdqu %7, m6
287 movdqu %8, m7
288 %endmacro
289
290
291 ; in: %2 clobbered
292 ; out: %1
293 ; mask in m11
294 ; clobbers m10
295 %macro MASKED_COPY 2
296 pand %2, m11 ; and mask
297 pandn m10, m11, %1; and -mask
298 por %2, m10
299 mova %1, %2
300 %endmacro
301
302 ; in: %2 clobbered
303 ; out: %1
304 ; mask in %3, will be clobbered
305 %macro MASKED_COPY2 3
306 pand %2, %3 ; and mask
307 pandn %3, %1; and -mask
308 por %2, %3
309 mova %1, %2
310 %endmacro
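; Both MASKED_COPY macros are a branchless select: lanes whose mask word is
; all-ones take the newly filtered value, all other lanes keep the original
; pixel. Per-lane C sketch (illustrative; the helper name is made up):
;
;     #include <stdint.h>
;
;     static void masked_copy(uint16_t *dst, const uint16_t *filtered,
;                             const uint16_t *mask, int n)
;     {
;         for (int i = 0; i < n; i++)            /* mask[i] is 0x0000 / 0xffff */
;             dst[i] = (uint16_t)((filtered[i] & mask[i]) | (dst[i] & ~mask[i]));
;     }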
311
312 ALIGN 16
313 ; input: p1, p0, q0, q1 in m0..m3, tc array pointer in r2. Output in m1 (p0') and m2 (q0')
314 %macro CHROMA_DEBLOCK_BODY 1
315 psubw m4, m2, m1; q0 - p0
316 psubw m5, m0, m3; p1 - q1
317 psllw m4, 2; << 2
318 paddw m5, m4;
319
320 ;tc calculations
321 movd m6, [r2]; tc0
322 add r2, 4;
323 punpcklwd m6, m6
324 movd m7, [r2]; tc1
325 punpcklwd m7, m7
326 shufps m6, m7, 0; tc0, tc1
327 pmullw m4, m6, [pw_m1]; -tc0, -tc1
328 ;end tc calculations
329
330 paddw m5, [pw_4]; +4
331 psraw m5, 3; >> 3
332
333 %if %1 > 8
334 psllw m4, %1-8; << (BIT_DEPTH - 8)
335 psllw m6, %1-8; << (BIT_DEPTH - 8)
336 %endif
337 pmaxsw m5, m4
338 pminsw m5, m6
339 paddw m1, m5; p0 + delta0
340 psubw m2, m5; q0 - delta0
341 %endmacro
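; CHROMA_DEBLOCK_BODY vectorises the normative HEVC chroma filter. Scalar
; sketch of the per-sample arithmetic (illustrative only; names are made up,
; tc is already shifted to the working bit depth, and the final clamp to the
; pixel range is left to the callers, as in the code above):
;
;     #include <stdint.h>
;
;     static int clip3(int x, int lo, int hi) { return x < lo ? lo : x > hi ? hi : x; }
;
;     static void chroma_filter_sample(int16_t *p0, int16_t *q0,
;                                      int p1, int q1, int tc)
;     {
;         int delta0 = clip3((((*q0 - *p0) << 2) + p1 - q1 + 4) >> 3, -tc, tc);
;         *p0 += delta0;                         /* m1 on exit */
;         *q0 -= delta0;                         /* m2 on exit */
;     }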
342
343 ; input: p3, p2, p1, p0 in m0..m3 and q0, q1, q2, q3 in m4..m7, beta in r2, tc array pointer in r3. Output in m1..m6 (p2' .. q2')
344 %macro LUMA_DEBLOCK_BODY 2
345 psllw m9, m2, 1; *2
346 psubw m10, m1, m9
347 paddw m10, m3
348 ABS1 m10, m11 ; 0dp0, 0dp3 , 1dp0, 1dp3
349
350 psllw m9, m5, 1; *2
351 psubw m11, m6, m9
352 paddw m11, m4
353 ABS1 m11, m13 ; 0dq0, 0dq3 , 1dq0, 1dq3
354
355 ;beta calculations
356 %if %1 > 8
357 shl betaq, %1 - 8
358 %endif
359 movd m13, betad
360 SPLATW m13, m13, 0
361 ;end beta calculations
362
363 paddw m9, m10, m11; 0d0, 0d3 , 1d0, 1d3
364
365 pshufhw m14, m9, q0033 ;0b00001111; 0d3 0d3 0d0 0d0 in high
366 pshuflw m14, m14, q0033 ;0b00001111; 1d3 1d3 1d0 1d0 in low
367
368 pshufhw m9, m9, q3300 ;0b11110000; 0d0 0d0 0d3 0d3
369 pshuflw m9, m9, q3300 ;0b11110000; 1d0 1d0 1d3 1d3
370
371 paddw m14, m9; 0d0+0d3, 1d0+1d3
372
373 ;compare
374 pcmpgtw m15, m13, m14
375 movmskps r13, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
376 test r13, r13
377 je .bypassluma
378
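; Up to here the macro has computed, per line, dp = |p2 - 2*p1 + p0| and
; dq = |q2 - 2*q1 + q0|, and the sums for lines 0 and 3 of each 4-line segment
; were just tested against beta. Scalar sketch of that gate (illustrative;
; p[k][x] / q[k][x] denote sample p_k / q_k on line x, beta already scaled to
; the bit depth):
;
;     #include <stdlib.h>
;
;     static int segment_is_filtered(const int p[4][4], const int q[4][4],
;                                    int beta)
;     {
;         int dp0 = abs(p[2][0] - 2 * p[1][0] + p[0][0]);
;         int dp3 = abs(p[2][3] - 2 * p[1][3] + p[0][3]);
;         int dq0 = abs(q[2][0] - 2 * q[1][0] + q[0][0]);
;         int dq3 = abs(q[2][3] - 2 * q[1][3] + q[0][3]);
;         return dp0 + dq0 + dp3 + dq3 < beta;   /* d0 + d3 < beta */
;     }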
379 ;weak / strong decision compare to beta_2
380 psraw m15, m13, 2; beta >> 2
381 psllw m8, m9, 1;
382 pcmpgtw m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
383 movmskps r14, m15;
384 ;end weak / strong decision
385
386 ; weak filter nd_p/q calculation
387 pshufd m8, m10, 0x31
388 psrld m8, 16
389 paddw m8, m10
390 movd r7d, m8
391 and r7, 0xffff; 1dp0 + 1dp3
392 pshufd m8, m8, 0x4E
393 movd r8d, m8
394 and r8, 0xffff; 0dp0 + 0dp3
395
396 pshufd m8, m11, 0x31
397 psrld m8, 16
398 paddw m8, m11
399 movd r9d, m8
400 and r9, 0xffff; 1dq0 + 1dq3
401 pshufd m8, m8, 0x4E
402 movd r10d, m8
403 and r10, 0xffff; 0dq0 + 0dq3
404 ; end calc for weak filter
405
406 ; filtering mask
407 mov r11, r13
408 shr r11, 3
409 movd m15, r11d
410 and r13, 1
411 movd m11, r13d
412 shufps m11, m15, 0
413 shl r11, 1
414 or r13, r11
415
416 pcmpeqd m11, [pd_1]; filtering mask
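; The movmskps bits in r13 (one per 4-line segment) were just expanded back
; into full-width lane masks: each bit goes into its own dword, and pcmpeqd
; against 1 yields all-ones for segments that are filtered, zero otherwise.
; Per-dword C sketch (illustrative; the helper name is made up):
;
;     static unsigned lane_mask(unsigned bit) { return bit == 1 ? ~0u : 0u; }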
417
418 ;decide between strong and weak filtering
419 ;tc25 calculations
420 mov r11d, [tcq];
421 %if %1 > 8
422 shl r11, %1 - 8
423 %endif
424 movd m8, r11d; tc0
425 add tcq, 4;
426 mov r3d, [tcq];
427 %if %1 > 8
428 shl r3, %1 - 8
429 %endif
430 movd m9, r3d; tc1
431 add r11d, r3d; tc0 + tc1
432     jz              .bypassluma; nothing to filter if tc0 + tc1 == 0
433 punpcklwd m8, m8
434 punpcklwd m9, m9
435 shufps m8, m9, 0; tc0, tc1
436 mova m9, m8
437 psllw m8, 2; tc << 2
438 pavgw m8, m9; tc25 = ((tc * 5 + 1) >> 1)
439 ;end tc25 calculations
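; pavgw computes (a + b + 1) >> 1, so with a = tc << 2 and b = tc the result
; is (5*tc + 1) >> 1, i.e. tc25, with no separate +1 step. Worked example
; (illustrative): tc = 3 -> ((12 + 3 + 1) >> 1) = 8 = (5*3 + 1) >> 1.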
440
441 ;----beta_3 comparison-----
442 psubw m12, m0, m3; p3 - p0
443 ABS1 m12, m14; abs(p3 - p0)
444
445 psubw m15, m7, m4; q3 - q0
446 ABS1 m15, m14; abs(q3 - q0)
447
448 paddw m12, m15; abs(p3 - p0) + abs(q3 - q0)
449
450 pshufhw m12, m12, 0xf0 ;0b11110000;
451 pshuflw m12, m12, 0xf0 ;0b11110000;
452
453 psraw m13, 3; beta >> 3
454 pcmpgtw m13, m12;
455 movmskps r11, m13;
456 and r14, r11; strong mask , beta_2 and beta_3 comparisons
457 ;----beta_3 comparison end-----
458 ;----tc25 comparison---
459 psubw m12, m3, m4; p0 - q0
460 ABS1 m12, m14; abs(p0 - q0)
461
462 pshufhw m12, m12, 0xf0 ;0b11110000;
463 pshuflw m12, m12, 0xf0 ;0b11110000;
464
465 pcmpgtw m8, m12; tc25 comparisons
466 movmskps r11, m8;
467 and r14, r11; strong mask, beta_2, beta_3 and tc25 comparisons
468 ;----tc25 comparison end---
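; The beta_2, beta_3 and tc25 tests above together form the normative
; strong-filter decision, evaluated on lines 0 and 3 of each 4-line segment.
; Scalar sketch (illustrative; d is dp + dq for the tested line, abs() from
; <stdlib.h>):
;
;     static int use_strong_filter(int d, int beta, int tc,
;                                  int p3, int p0, int q0, int q3)
;     {
;         int tc25 = (5 * tc + 1) >> 1;
;         return (2 * d) < (beta >> 2)                     &&
;                abs(p3 - p0) + abs(q3 - q0) < (beta >> 3) &&
;                abs(p0 - q0) < tc25;
;     }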
469 mov r11, r14;
470 shr r11, 1;
471 and r14, r11; strong mask, bits 2 and 0
472
473 pmullw m14, m9, [pw_m2]; -tc * 2
474 paddw m9, m9
475
476 and r14, 5; 0b101
477 mov r11, r14; strong mask
478 shr r14, 2;
479 movd m12, r14d; store to xmm for mask generation
480 shl r14, 1
481 and r11, 1
482 movd m10, r11d; store to xmm for mask generation
483 or r14, r11; final strong mask, bits 1 and 0
484 jz .weakfilter
485
486 shufps m10, m12, 0
487 pcmpeqd m10, [pd_1]; strong mask
488
489 mova m13, [pw_4]; 4 in every cell
490 pand m11, m10; combine filtering mask and strong mask
491 paddw m12, m2, m3; p1 + p0
492 paddw m12, m4; p1 + p0 + q0
493 mova m10, m12; copy
494 paddw m12, m12; 2*p1 + 2*p0 + 2*q0
495 paddw m12, m1; p2 + 2*p1 + 2*p0 + 2*q0
496 paddw m12, m5; p2 + 2*p1 + 2*p0 + 2*q0 + q1
497 paddw m12, m13; p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
498 psraw m12, 3; ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
499 psubw m12, m3; ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3) - p0
500 pmaxsw m12, m14
501 pminsw m12, m9; av_clip( , -2 * tc, 2 * tc)
502 paddw m12, m3; p0'
503
504 paddw m15, m1, m10; p2 + p1 + p0 + q0
505 psrlw m13, 1; 2 in every cell
506 paddw m15, m13; p2 + p1 + p0 + q0 + 2
507 psraw m15, 2; (p2 + p1 + p0 + q0 + 2) >> 2
508 psubw m15, m2;((p2 + p1 + p0 + q0 + 2) >> 2) - p1
509 pmaxsw m15, m14
510 pminsw m15, m9; av_clip( , -2 * tc, 2 * tc)
511 paddw m15, m2; p1'
512
513 paddw m8, m1, m0; p3 + p2
514 paddw m8, m8; 2*p3 + 2*p2
515 paddw m8, m1; 2*p3 + 3*p2
516 paddw m8, m10; 2*p3 + 3*p2 + p1 + p0 + q0
517 paddw m13, m13
518 paddw m8, m13; 2*p3 + 3*p2 + p1 + p0 + q0 + 4
519 psraw m8, 3; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
520 psubw m8, m1; ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3) - p2
521 pmaxsw m8, m14
522 pminsw m8, m9; av_clip( , -2 * tc, 2 * tc)
523 paddw m8, m1; p2'
524 MASKED_COPY m1, m8
525
526 paddw m8, m3, m4; p0 + q0
527 paddw m8, m5; p0 + q0 + q1
528 paddw m8, m8; 2*p0 + 2*q0 + 2*q1
529 paddw m8, m2; p1 + 2*p0 + 2*q0 + 2*q1
530 paddw m8, m6; p1 + 2*p0 + 2*q0 + 2*q1 + q2
531 paddw m8, m13; p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4
532 psraw m8, 3; (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >>3
533 psubw m8, m4;
534 pmaxsw m8, m14
535 pminsw m8, m9; av_clip( , -2 * tc, 2 * tc)
536 paddw m8, m4; q0'
537 MASKED_COPY m2, m15
538
539 paddw m15, m3, m4; p0 + q0
540 paddw m15, m5; p0 + q0 + q1
541 mova m10, m15;
542 paddw m15, m6; p0 + q0 + q1 + q2
543 psrlw m13, 1; 2 in every cell
544 paddw m15, m13; p0 + q0 + q1 + q2 + 2
545 psraw m15, 2; (p0 + q0 + q1 + q2 + 2) >> 2
546 psubw m15, m5; ((p0 + q0 + q1 + q2 + 2) >> 2) - q1
547 pmaxsw m15, m14
548 pminsw m15, m9; av_clip( , -2 * tc, 2 * tc)
549 paddw m15, m5; q1'
550
551 paddw m13, m7; q3 + 2
552 paddw m13, m6; q3 + q2 + 2
553 paddw m13, m13; 2*q3 + 2*q2 + 4
554 paddw m13, m6; 2*q3 + 3*q2 + 4
555 paddw m13, m10; 2*q3 + 3*q2 + q1 + q0 + p0 + 4
556 psraw m13, 3; (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
557 psubw m13, m6; ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3) - q2
558 pmaxsw m13, m14
559 pminsw m13, m9; av_clip( , -2 * tc, 2 * tc)
560 paddw m13, m6; q2'
561
562 MASKED_COPY m6, m13
563 MASKED_COPY m5, m15
564 MASKED_COPY m4, m8
565 MASKED_COPY m3, m12
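; The section above is the strong luma filter. For the p side (the q side is
; symmetric with p and q swapped) it computes the three normative averages and
; clips each result to +/- 2*tc around the input sample. Scalar sketch
; (illustrative; clip3() as in the chroma sketch above, P[k] = p_k widened to
; int):
;
;     static void strong_filter_p(int *P, int q0, int q1, int tc)
;     {
;         int p0 = P[0], p1 = P[1], p2 = P[2], p3 = P[3];
;         P[0] = clip3((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3,
;                      p0 - 2 * tc, p0 + 2 * tc);
;         P[1] = clip3((p2 + p1 + p0 + q0 + 2) >> 2,
;                      p1 - 2 * tc, p1 + 2 * tc);
;         P[2] = clip3((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3,
;                      p2 - 2 * tc, p2 + 2 * tc);
;     }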
566
567 .weakfilter:
568 not r14; strong mask -> weak mask
569 and r14, r13; final weak filtering mask, bits 0 and 1
570 jz .store
571
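; What follows is the normative weak filter: delta0 from the 9/3 gradient,
; discarded when |delta0| >= 10*tc, otherwise clipped to +/- tc and applied to
; p0/q0; p1 and q1 each get a half-strength correction only when their side's
; activity (dp or dq) is below (beta + (beta >> 1)) >> 3. Scalar sketch
; (illustrative; clip3()/abs() as in the earlier sketches):
;
;     static void weak_filter_line(int *p1, int *p0, int *q0, int *q1,
;                                  int p2, int q2, int tc,
;                                  int filter_p1, int filter_q1)
;     {
;         int delta0 = (9 * (*q0 - *p0) - 3 * (*q1 - *p1) + 8) >> 4;
;
;         if (abs(delta0) >= 10 * tc)
;             return;                               /* leave the line as-is */
;         delta0 = clip3(delta0, -tc, tc);
;
;         if (filter_p1)                            /* dp side below threshold */
;             *p1 += clip3((((p2 + *p0 + 1) >> 1) - *p1 + delta0) >> 1,
;                          -(tc >> 1), tc >> 1);
;         if (filter_q1)                            /* dq side below threshold */
;             *q1 += clip3((((q2 + *q0 + 1) >> 1) - *q1 - delta0) >> 1,
;                          -(tc >> 1), tc >> 1);
;         *p0 += delta0;
;         *q0 -= delta0;
;     }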
572 ; weak filtering mask
573 mov r11, r14
574 shr r11, 1
575 movd m12, r11d
576 and r14, 1
577 movd m11, r14d
578 shufps m11, m12, 0
579 pcmpeqd m11, [pd_1]; filtering mask
580
581 mov r13, betaq
582 shr r13, 1;
583 add betaq, r13
584     shr              betaq, 3; (beta + (beta >> 1)) >> 3
585
586 mova m13, [pw_8]
587 psubw m12, m4, m3 ; q0 - p0
588 psllw m10, m12, 3; 8 * (q0 - p0)
589 paddw m12, m10 ; 9 * (q0 - p0)
590
591 psubw m10, m5, m2 ; q1 - p1
592 psllw m8, m10, 1; 2 * ( q1 - p1 )
593 paddw m10, m8; 3 * ( q1 - p1 )
594 psubw m12, m10; 9 * (q0 - p0) - 3 * ( q1 - p1 )
595 paddw m12, m13; + 8
596 psraw m12, 4; >> 4 , delta0
597 PABSW m13, m12; abs(delta0)
598
599
600 psllw m10, m9, 2; 8 * tc
601 paddw m10, m9; 10 * tc
602 pcmpgtw m10, m13
603 pand m11, m10
604
605 psraw m9, 1; tc * 2 -> tc
606 psraw m14, 1; -tc * 2 -> -tc
607
608 pmaxsw m12, m14
609 pminsw m12, m9; av_clip(delta0, -tc, tc)
610
611 psraw m9, 1; tc -> tc / 2
612 pmullw m14, m9, [pw_m1]; -tc / 2
613
614 pavgw m15, m1, m3; (p2 + p0 + 1) >> 1
615 psubw m15, m2; ((p2 + p0 + 1) >> 1) - p1
616 paddw m15, m12; ((p2 + p0 + 1) >> 1) - p1 + delta0
617 psraw m15, 1; (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
618 pmaxsw m15, m14
619 pminsw m15, m9; av_clip(deltap1, -tc/2, tc/2)
620 paddw m15, m2; p1'
621
622 ;beta calculations
623 movd m10, betad
624 SPLATW m10, m10, 0
625
626 movd m13, r7d; 1dp0 + 1dp3
627 movd m8, r8d; 0dp0 + 0dp3
628 punpcklwd m8, m8
629 punpcklwd m13, m13
630 shufps m13, m8, 0;
631 pcmpgtw m8, m10, m13
632 pand m8, m11
633 ;end beta calculations
634 MASKED_COPY2 m2, m15, m8; write p1'
635
636 pavgw m8, m6, m4; (q2 + q0 + 1) >> 1
637 psubw m8, m5; ((q2 + q0 + 1) >> 1) - q1
638     psubw            m8, m12; ((q2 + q0 + 1) >> 1) - q1 - delta0
639     psraw            m8, 1;   (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
640 pmaxsw m8, m14
641 pminsw m8, m9; av_clip(deltaq1, -tc/2, tc/2)
642 paddw m8, m5; q1'
643
644 movd m13, r9d;
645 movd m15, r10d;
646 punpcklwd m15, m15
647 punpcklwd m13, m13
648 shufps m13, m15, 0; dq0 + dq3
649
650 pcmpgtw m10, m13; compare to ((beta+(beta>>1))>>3)
651 pand m10, m11
652 MASKED_COPY2 m5, m8, m10; write q1'
653
654 paddw m15, m3, m12 ; p0 + delta0
655 MASKED_COPY m3, m15
656
657 psubw m8, m4, m12 ; q0 - delta0
658 MASKED_COPY m4, m8
659 %endmacro
660
661 INIT_XMM sse2
662 ;-----------------------------------------------------------------------------
663 ; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc,
664 ; uint8_t *_no_p, uint8_t *_no_q);
665 ;-----------------------------------------------------------------------------
666 cglobal hevc_v_loop_filter_chroma_8, 3, 6, 8
667 sub r0, 2
668 lea r5, [3 * r1]
669 mov r4, r0
670 add r0, r5
671 TRANSPOSE4x8B_LOAD PASS8ROWS(r4, r0, r1, r5)
672 CHROMA_DEBLOCK_BODY 8
673 TRANSPOSE8x4B_STORE PASS8ROWS(r4, r0, r1, r5)
674 RET
675
676 cglobal hevc_v_loop_filter_chroma_10, 3, 6, 8
677 sub r0, 4
678 lea r5, [3 * r1]
679 mov r4, r0
680 add r0, r5
681 TRANSPOSE4x8W_LOAD PASS8ROWS(r4, r0, r1, r5)
682 CHROMA_DEBLOCK_BODY 10
683 TRANSPOSE8x4W_STORE PASS8ROWS(r4, r0, r1, r5)
684 RET
685
686 ;-----------------------------------------------------------------------------
687 ; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int *_tc,
688 ; uint8_t *_no_p, uint8_t *_no_q);
689 ;-----------------------------------------------------------------------------
690 cglobal hevc_h_loop_filter_chroma_8, 3, 6, 8
691 mov r5, r0; pix
692 sub r5, r1
693 sub r5, r1
694 movh m0, [r5]; p1
695 movh m1, [r5 + r1]; p0
696 movh m2, [r0]; q0
697 movh m3, [r0 + r1]; q1
698 pxor m5, m5; zeros reg
699 punpcklbw m0, m5
700 punpcklbw m1, m5
701 punpcklbw m2, m5
702 punpcklbw m3, m5
703 CHROMA_DEBLOCK_BODY 8
704 packuswb m1, m2
705 movh [r5 + r1], m1
706 movhps [r0], m1
707 RET
708
709 cglobal hevc_h_loop_filter_chroma_10, 3, 6, 8
710 mov r5, r0; pix
711 sub r5, r1
712 sub r5, r1
713 movdqu m0, [r5]; p1
714 movdqu m1, [r5+r1]; p0
715 movdqu m2, [r0]; q0
716 movdqu m3, [r0 + r1]; q1
717 CHROMA_DEBLOCK_BODY 10
718 pxor m5, m5; zeros reg
719 CLIPW m1, m5, [pw_pixel_max]
720 CLIPW m2, m5, [pw_pixel_max]
721 movdqu [r5 + r1], m1
722 movdqu [r0], m2
723 RET
724
725 %if ARCH_X86_64
726 INIT_XMM ssse3
727 ;-----------------------------------------------------------------------------
728 ; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
729 ; int *_tc, uint8_t *_no_p, uint8_t *_no_q);
730 ;-----------------------------------------------------------------------------
731 cglobal hevc_v_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc
732 sub r0, 4
733 lea r5, [3 * r1]
734 mov r6, r0
735 add r0, r5
736 TRANSPOSE8x8B_LOAD PASS8ROWS(r6, r0, r1, r5)
737 LUMA_DEBLOCK_BODY 8, v
738 .store:
739 TRANSPOSE8x8B_STORE PASS8ROWS(r6, r0, r1, r5)
740 .bypassluma:
741 RET
742
743 cglobal hevc_v_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc
744 sub pixq, 8
745 lea r5, [3 * strideq]
746 mov r6, pixq
747 add pixq, r5
748 TRANSPOSE8x8W_LOAD PASS8ROWS(r6, pixq, strideq, r5)
749 LUMA_DEBLOCK_BODY 10, v
750 .store:
751 TRANSPOSE8x8W_STORE PASS8ROWS(r6, r0, r1, r5)
752 .bypassluma:
753 RET
754
755 ;-----------------------------------------------------------------------------
756 ; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
757 ; int *_tc, uint8_t *_no_p, uint8_t *_no_q);
758 ;-----------------------------------------------------------------------------
759 cglobal hevc_h_loop_filter_luma_8, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
760 lea src3strideq, [3 * strideq]
761 mov pix0q, pixq
762 sub pix0q, src3strideq
763 sub pix0q, strideq
764 movdqu m0, [pix0q]; p3
765 movdqu m1, [pix0q + strideq]; p2
766 movdqu m2, [pix0q + 2 * strideq]; p1
767 movdqu m3, [pix0q + src3strideq]; p0
768 movdqu m4, [pixq]; q0
769 movdqu m5, [pixq + strideq]; q1
770 movdqu m6, [pixq + 2 * strideq]; q2
771 movdqu m7, [pixq + src3strideq]; q3
772 pxor m8, m8
773 punpcklbw m0, m8
774 punpcklbw m1, m8
775 punpcklbw m2, m8
776 punpcklbw m3, m8
777 punpcklbw m4, m8
778 punpcklbw m5, m8
779 punpcklbw m6, m8
780 punpcklbw m7, m8
781 LUMA_DEBLOCK_BODY 8, h
782 .store:
783 packuswb m1, m2
784 packuswb m3, m4
785 packuswb m5, m6
786 movh [r5 + r1], m1
787 movhps [r5 + 2 * r1], m1
788 movh [r5 + r6], m3
789 movhps [r0 ], m3
790 movh [r0 + r1], m5
791 movhps [r0 + 2 * r1], m5
792 .bypassluma:
793 RET
794
795 cglobal hevc_h_loop_filter_luma_10, 4, 15, 16, pix, stride, beta, tc, count, pix0, src3stride
796 lea src3strideq, [3 * strideq]
797 mov pix0q, pixq
798 sub pix0q, src3strideq
799 sub pix0q, strideq
800 movdqu m0, [pix0q]; p3
801 movdqu m1, [pix0q + strideq]; p2
802 movdqu m2, [pix0q + 2 * strideq]; p1
803 movdqu m3, [pix0q + src3strideq]; p0
804 movdqu m4, [pixq]; q0
805 movdqu m5, [pixq + strideq]; q1
806 movdqu m6, [pixq + 2 * strideq]; q2
807 movdqu m7, [pixq + src3strideq]; q3
808 LUMA_DEBLOCK_BODY 10, h
809 .store:
810 pxor m8, m8; zeros reg
811 CLIPW m1, m8, [pw_pixel_max]
812 CLIPW m2, m8, [pw_pixel_max]
813 CLIPW m3, m8, [pw_pixel_max]
814 CLIPW m4, m8, [pw_pixel_max]
815 CLIPW m5, m8, [pw_pixel_max]
816 CLIPW m6, m8, [pw_pixel_max]
817 movdqu [pix0q + strideq], m1; p2
818 movdqu [pix0q + 2 * strideq], m2; p1
819 movdqu [pix0q + src3strideq], m3; p0
820 movdqu [pixq ], m4; q0
821 movdqu [pixq + strideq], m5; q1
822 movdqu [pixq + 2 * strideq], m6; q2
823 .bypassluma:
824 RET
825 %endif