;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

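; 10-bit pixel clamp value (2^10 - 1 = 1023) and the +32 rounding bias that is
; applied before the final >>6 descale of the transform output.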
pw_pixel_max: times 8 dw ((1 << 10)-1)
pd_32:        times 4 dd 32
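; scan8 maps a 4x4 block index (luma 0-15, then chroma) to its position in the
; 8-wide nnzc[6*8] layout that the idct_add16/add16intra/add8 wrappers receive.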
scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
           db  6+13*8, 7+13*8, 6+14*8, 7+14*8

%ifdef PIC
%define scan8 r11
%else
%define scan8 scan8_mem
%endif

SECTION .text

;-----------------------------------------------------------------------------
; void h264_idct_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
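; Descale two rows of 32-bit residuals by 6, pack them to words, add the two
; rows of pixels at %5 and %5+%6, clamp to [0, pw_pixel_max] and store back.
; %1/%2 = coefficient rows, %3 = temp, %4 = zero register, %5 = dst, %6 = stride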
%macro STORE_DIFFx2 6
    psrad     %1, 6
    psrad     %2, 6
    packssdw  %1, %2
    movq      %3, [%5]
    movhps    %3, [%5+%6]
    paddsw    %1, %3
    CLIPW     %1, %4, [pw_pixel_max]
    movq      [%5], %1
    movhps    [%5+%6], %1
%endmacro

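; Same as STORE_DIFFx2 but for one aligned 16-byte row: %3/%4 are the min/max
; clamp registers and %5 points at the destination row.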
%macro STORE_DIFF16 5
    psrad     %1, 6
    psrad     %2, 6
    packssdw  %1, %2
    paddsw    %1, [%5]
    CLIPW     %1, %3, %4
    mova      [%5], %1
%endmacro

; %1 = dst, %2 = block (coefficients), %3 = stride
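; 4x4 transform: 1-D IDCT on the rows, transpose, add pd_32 to the first row
; (so every output of the column pass picks up the +32 rounding), 1-D IDCT on
; the columns, then store two row pairs into dst.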
%macro IDCT4_ADD_10 3
    mova      m0, [%2+ 0]
    mova      m1, [%2+16]
    mova      m2, [%2+32]
    mova      m3, [%2+48]
    IDCT4_1D  d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd     m0, [pd_32]
    IDCT4_1D  d,0,1,2,3,4,5
    pxor      m5, m5
    STORE_DIFFx2 m0, m1, m4, m5, %1, %3
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, %1, %3
%endmacro

%macro IDCT_ADD_10 1
cglobal h264_idct_add_10_%1, 3,3
    IDCT4_ADD_10 r0, r1, r2
    RET
%endmacro

INIT_XMM
IDCT_ADD_10 sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT_ADD_10 avx
%endif

;-----------------------------------------------------------------------------
; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
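; Internal helper (plain label, not cglobal) shared by the add16/add16intra/
; add8 wrappers: expects r0 = dst base, r5 = dst offset of the current 4x4
; block (added to r0 here), r2 = coefficients, r3 = stride.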
%macro ADD4x4IDCT 1
add4x4_idct_%1:
    add       r5, r0
    mova      m0, [r2+ 0]
    mova      m1, [r2+16]
    mova      m2, [r2+32]
    mova      m3, [r2+48]
    IDCT4_1D  d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd     m0, [pd_32]
    IDCT4_1D  d,0,1,2,3,4,5
    pxor      m5, m5
    STORE_DIFFx2 m0, m1, m4, m5, r5, r3
    lea       r5, [r5+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, r5, r3
    ret
%endmacro

INIT_XMM
ALIGN 16
ADD4x4IDCT sse2
%ifdef HAVE_AVX
INIT_AVX
ALIGN 16
ADD4x4IDCT avx
%endif

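; %2 = 4x4 block index, %3 = that block's scan8 offset into nnzc[].  Skip the
; block if its non-zero count is 0, otherwise fetch block_offset[%2] into r5
; and run the shared 4x4 idct-add; r2 then advances to the next 64-byte block.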
%macro ADD16_OP 3
    cmp       byte [r4+%3], 0
    jz .skipblock%2
    mov       r5d, [r1+%2*4]
    call add4x4_idct_%1
.skipblock%2:
%if %2<15
    add       r2, 64
%endif
%endmacro

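; The 16 luma 4x4 blocks are visited in decoding order; the third argument of
; each ADD16_OP is the block's position in the scan8/nnzc layout.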
%macro IDCT_ADD16_10 1
cglobal h264_idct_add16_10_%1, 5,6
    ADD16_OP %1, 0, 4+1*8
    ADD16_OP %1, 1, 5+1*8
    ADD16_OP %1, 2, 4+2*8
    ADD16_OP %1, 3, 5+2*8
    ADD16_OP %1, 4, 6+1*8
    ADD16_OP %1, 5, 7+1*8
    ADD16_OP %1, 6, 6+2*8
    ADD16_OP %1, 7, 7+2*8
    ADD16_OP %1, 8, 4+3*8
    ADD16_OP %1, 9, 5+3*8
    ADD16_OP %1, 10, 4+4*8
    ADD16_OP %1, 11, 5+4*8
    ADD16_OP %1, 12, 6+3*8
    ADD16_OP %1, 13, 7+3*8
    ADD16_OP %1, 14, 6+4*8
    ADD16_OP %1, 15, 7+4*8
    REP_RET
%endmacro

INIT_XMM
IDCT_ADD16_10 sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT_ADD16_10 avx
%endif

;-----------------------------------------------------------------------------
; void h264_idct_dc_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
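; Add the DC value splatted in m0 to four rows of pixels at %1 (row offsets %2
; and %3 = 3*stride), clamping each row to [0, m6].  The AVX path uses the
; non-destructive three-operand paddw and skips the separate loads.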
%macro IDCT_DC_ADD_OP_10 3
    pxor      m5, m5
%if avx_enabled
    paddw     m1, m0, [%1+0   ]
    paddw     m2, m0, [%1+%2  ]
    paddw     m3, m0, [%1+%2*2]
    paddw     m4, m0, [%1+%3  ]
%else
    mova      m1, [%1+0   ]
    mova      m2, [%1+%2  ]
    mova      m3, [%1+%2*2]
    mova      m4, [%1+%3  ]
    paddw     m1, m0
    paddw     m2, m0
    paddw     m3, m0
    paddw     m4, m0
%endif
    CLIPW     m1, m5, m6
    CLIPW     m2, m5, m6
    CLIPW     m3, m5, m6
    CLIPW     m4, m5, m6
    mova      [%1+0   ], m1
    mova      [%1+%2  ], m2
    mova      [%1+%2*2], m3
    mova      [%1+%3  ], m4
%endmacro

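; dc = (block[0] + 32) >> 6, splatted to all four words of an MMX register and
; added to a 4x4 block of 10-bit pixels (4 pixels = 8 bytes per MMX row).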
INIT_MMX
cglobal h264_idct_dc_add_10_mmx2,3,3
    movd      m0, [r1]
    paddd     m0, [pd_32]
    psrad     m0, 6
    lea       r1, [r2*3]
    pshufw    m0, m0, 0
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET

;-----------------------------------------------------------------------------
; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_DC_ADD 1
cglobal h264_idct8_dc_add_10_%1,3,3,7
    mov       r1d, [r1]
    add       r1, 32
    sar       r1, 6
    movd      m0, r1d
    lea       r1, [r2*3]
    SPLATW    m0, m0, 0
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    lea       r0, [r0+r2*4]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET
%endmacro

INIT_XMM
IDCT8_DC_ADD sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT8_DC_ADD avx
%endif

;-----------------------------------------------------------------------------
; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
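; Intra path: the nnzc entry is tested as a word so one compare covers a pair
; of horizontally adjacent 4x4 blocks.  Non-zero AC takes the .ac path (two
; full 4x4 idct-adds); otherwise only the DC terms, if any, are added via
; idct_dc_add.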
%macro AC 2
.ac%2:
    mov       r5d, [r1+(%2+0)*4]
    call add4x4_idct_%1
    mov       r5d, [r1+(%2+1)*4]
    add       r2, 64
    call add4x4_idct_%1
    add       r2, 64
    jmp .skipadd%2
%endmacro

%assign last_block 16
%macro ADD16_OP_INTRA 3
    cmp       word [r4+%3], 0
    jnz .ac%2
    mov       r5d, [r2+ 0]
    or        r5d, [r2+64]
    jz .skipblock%2
    mov       r5d, [r1+(%2+0)*4]
    call idct_dc_add_%1
.skipblock%2:
%if %2<last_block-2
    add       r2, 128
%endif
.skipadd%2:
%endmacro

%macro IDCT_ADD16INTRA_10 1
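; DC-only helper for a pair of adjacent 4x4 blocks: the two DC coefficients
; (at r2+0 and r2+64) are descaled, each splatted across one half of m0, and
; added to an 8-pixel-wide strip in a single IDCT_DC_ADD_OP_10 pass.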
idct_dc_add_%1:
    add       r5, r0
    movq      m0, [r2+ 0]
    movhps    m0, [r2+64]
    paddd     m0, [pd_32]
    psrad     m0, 6
    pshufhw   m0, m0, 0
    pshuflw   m0, m0, 0
    lea       r6, [r3*3]
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r5, r3, r6
    ret

cglobal h264_idct_add16intra_10_%1,5,7,8
    ADD16_OP_INTRA %1, 0, 4+1*8
    ADD16_OP_INTRA %1, 2, 4+2*8
    ADD16_OP_INTRA %1, 4, 6+1*8
    ADD16_OP_INTRA %1, 6, 6+2*8
    ADD16_OP_INTRA %1, 8, 4+3*8
    ADD16_OP_INTRA %1, 10, 4+4*8
    ADD16_OP_INTRA %1, 12, 6+3*8
    ADD16_OP_INTRA %1, 14, 6+4*8
    REP_RET
    AC %1, 8
    AC %1, 10
    AC %1, 12
    AC %1, 14
    AC %1, 0
    AC %1, 2
    AC %1, 4
    AC %1, 6
%endmacro

INIT_XMM
IDCT_ADD16INTRA_10 sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT_ADD16INTRA_10 avx
%endif

%assign last_block 36
;-----------------------------------------------------------------------------
; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
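; dst is an array of the two chroma plane pointers.  Chroma coefficients start
; at block 16 (byte offset 1024); after the U blocks, r2 is advanced to block
; 32 for the V plane, whose pointer is reloaded from dst[1].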
%macro IDCT_ADD8 1
cglobal h264_idct_add8_10_%1,5,7
%ifdef ARCH_X86_64
    mov       r10, r0
%endif
    add       r2, 1024
    mov       r0, [r0]
    ADD16_OP_INTRA %1, 16, 4+ 6*8
    ADD16_OP_INTRA %1, 18, 4+ 7*8
    add       r2, 1024-128*2
%ifdef ARCH_X86_64
    mov       r0, [r10+gprsize]
%else
    mov       r0, r0m
    mov       r0, [r0+gprsize]
%endif
    ADD16_OP_INTRA %1, 32, 4+11*8
    ADD16_OP_INTRA %1, 34, 4+12*8
    REP_RET
    AC %1, 16
    AC %1, 18
    AC %1, 32
    AC %1, 34

%endmacro ; IDCT_ADD8

INIT_XMM
IDCT_ADD8 sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT_ADD8 avx
%endif

;-----------------------------------------------------------------------------
; void h264_idct8_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
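; One pass of the 8x8 transform on 32-bit coefficients.  Rows 1-3 and 5-7 are
; expected in m1-m3 and m5-m7 (m0/m4 serve as temporaries), while rows 0 and 4
; come from the memory operands %1 and %2; the final SWAP restores natural
; register order.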
%macro IDCT8_1D 2
    SWAP      0, 1
    psrad     m4, m5, 1
    psrad     m1, m0, 1
    paddd     m4, m5
    paddd     m1, m0
    paddd     m4, m7
    paddd     m1, m5
    psubd     m4, m0
    paddd     m1, m3

    psubd     m0, m3
    psubd     m5, m3
    paddd     m0, m7
    psubd     m5, m7
    psrad     m3, 1
    psrad     m7, 1
    psubd     m0, m3
    psubd     m5, m7

    SWAP      1, 7
    psrad     m1, m7, 2
    psrad     m3, m4, 2
    paddd     m3, m0
    psrad     m0, 2
    paddd     m1, m5
    psrad     m5, 2
    psubd     m0, m4
    psubd     m7, m5

    SWAP      5, 6
    psrad     m4, m2, 1
    psrad     m6, m5, 1
    psubd     m4, m5
    paddd     m6, m2

    mova      m2, %1
    mova      m5, %2
    SUMSUB_BA d, 5, 2
    SUMSUB_BA d, 6, 5
    SUMSUB_BA d, 4, 2
    SUMSUB_BA d, 7, 6
    SUMSUB_BA d, 0, 4
    SUMSUB_BA d, 3, 2
    SUMSUB_BA d, 1, 5
    SWAP      7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

%macro IDCT8_1D_FULL 1
    mova      m7, [%1+112*2]
    mova      m6, [%1+ 96*2]
    mova      m5, [%1+ 80*2]
    mova      m3, [%1+ 48*2]
    mova      m2, [%1+ 32*2]
    mova      m1, [%1+ 16*2]
    IDCT8_1D  [%1], [%1+ 64*2]
%endmacro

; %1=int32_t *block, %2=int32_t *dstblock (10-bit coefficients are 32 bits wide)
%macro IDCT8_ADD_SSE_START 2
    IDCT8_1D_FULL %1
%ifdef ARCH_X86_64
    TRANSPOSE4x4D 0,1,2,3,8
    mova      [%2    ], m0
    TRANSPOSE4x4D 4,5,6,7,8
    mova      [%2+8*2], m4
%else
    mova      [%1], m7
    TRANSPOSE4x4D 0,1,2,3,7
    mova      m7, [%1]
    mova      [%2     ], m0
    mova      [%2+16*2], m1
    mova      [%2+32*2], m2
    mova      [%2+48*2], m3
    TRANSPOSE4x4D 4,5,6,7,3
    mova      [%2+ 8*2], m4
    mova      [%2+24*2], m5
    mova      [%2+40*2], m6
    mova      [%2+56*2], m7
%endif
%endmacro

; %1=pixel *dst (uint16_t), %2=int32_t *block, %3=int stride
%macro IDCT8_ADD_SSE_END 3
    IDCT8_1D_FULL %2
    mova      [%2     ], m6
    mova      [%2+16*2], m7

    pxor      m7, m7
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m6, m7, %1, %3
    mova      m0, [%2     ]
    mova      m1, [%2+16*2]
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m4, m5, m6, m7, %1, %3
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
%endmacro

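; The public function only aligns the stack where the ABI does not already
; guarantee it (non-UNIX64) and then runs the internal h264_idct8_add1_10
; helper, which uses a 256-byte scratch buffer for the row-transformed halves;
; on x86-64 most of the intermediate rows stay in xmm8-xmm15 instead of being
; reloaded from the buffer.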
%macro IDCT8_ADD 1
cglobal h264_idct8_add_10_%1, 3,4,16
%ifndef UNIX64
    %assign pad 16-gprsize-(stack_offset&15)
    sub       rsp, pad
    call h264_idct8_add1_10_%1
    add       rsp, pad
    RET
%endif

ALIGN 16
; TODO: does not need to use stack
h264_idct8_add1_10_%1:
%assign pad 256+16-gprsize
    sub       rsp, pad
    add       dword [r1], 32

%ifdef ARCH_X86_64
    IDCT8_ADD_SSE_START r1, rsp
    SWAP      1, 9
    SWAP      2, 10
    SWAP      3, 11
    SWAP      5, 13
    SWAP      6, 14
    SWAP      7, 15
    IDCT8_ADD_SSE_START r1+16, rsp+128
    PERMUTE   1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
    IDCT8_1D  [rsp], [rsp+128]
    SWAP      0, 8
    SWAP      1, 9
    SWAP      2, 10
    SWAP      3, 11
    SWAP      4, 12
    SWAP      5, 13
    SWAP      6, 14
    SWAP      7, 15
    IDCT8_1D  [rsp+16], [rsp+144]
    psrad     m8, 6
    psrad     m0, 6
    packssdw  m8, m0
    paddsw    m8, [r0]
    pxor      m0, m0
    CLIPW     m8, m0, [pw_pixel_max]
    mova      [r0], m8
    mova      m8, [pw_pixel_max]
    STORE_DIFF16 m9, m1, m0, m8, r0+r2
    lea       r0, [r0+r2*2]
    STORE_DIFF16 m10, m2, m0, m8, r0
    STORE_DIFF16 m11, m3, m0, m8, r0+r2
    lea       r0, [r0+r2*2]
    STORE_DIFF16 m12, m4, m0, m8, r0
    STORE_DIFF16 m13, m5, m0, m8, r0+r2
    lea       r0, [r0+r2*2]
    STORE_DIFF16 m14, m6, m0, m8, r0
    STORE_DIFF16 m15, m7, m0, m8, r0+r2
%else
    IDCT8_ADD_SSE_START r1,    rsp
    IDCT8_ADD_SSE_START r1+16, rsp+128
    lea       r3, [r0+8]
    IDCT8_ADD_SSE_END r0, rsp,    r2
    IDCT8_ADD_SSE_END r3, rsp+16, r2
%endif ; ARCH_X86_64

    add       rsp, pad
    ret
%endmacro

INIT_XMM
IDCT8_ADD sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT8_ADD avx
%endif

;-----------------------------------------------------------------------------
; h264_idct8_add4(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
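; Same skip logic as ADD16_OP, but per 8x8 block (indices 0, 4, 8, 12): r0 is
; loaded with dst + block_offset[%2] and r1 steps through the coefficients in
; 256-byte (8x8 dword) increments.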
%macro IDCT8_ADD4_OP 3
    cmp       byte [r4+%3], 0
    jz .skipblock%2
    mov       r0d, [r6+%2*4]
    add       r0, r5
    call h264_idct8_add1_10_%1
.skipblock%2:
%if %2<12
    add       r1, 256
%endif
%endmacro

%macro IDCT8_ADD4 1
cglobal h264_idct8_add4_10_%1, 0,7,16
    %assign pad 16-gprsize-(stack_offset&15)
    SUB       rsp, pad
    mov       r5, r0mp
    mov       r6, r1mp
    mov       r1, r2mp
    mov       r2d, r3m
    movifnidn r4, r4mp
    IDCT8_ADD4_OP %1,  0, 4+1*8
    IDCT8_ADD4_OP %1,  4, 6+1*8
    IDCT8_ADD4_OP %1,  8, 4+3*8
    IDCT8_ADD4_OP %1, 12, 6+3*8
    ADD       rsp, pad
    RET
%endmacro ; IDCT8_ADD4

INIT_XMM
IDCT8_ADD4 sse2
%ifdef HAVE_AVX
INIT_AVX
IDCT8_ADD4 avx
%endif