x86: dcadsp: implement int8x8_fmul_int32
[libav.git] / libavcodec / x86 / dcadsp.asm
1 ;******************************************************************************
2 ;* SSE-optimized functions for the DCA decoder
3 ;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
4 ;*
5 ;* This file is part of Libav.
6 ;*
7 ;* Libav is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
11 ;*
12 ;* Libav is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
16 ;*
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with Libav; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
21
22 %include "libavutil/x86/x86util.asm"
23
SECTION_RODATA

; 1/16 as a single-precision float (bit pattern 0x3D800000), stored four times
pf_inv16:   times 4 dd 0.0625

SECTION_TEXT
28
;------------------------------------------------------------------------------
; void int8x8_fmul_int32(float *dst, const int8_t *src, int scale)
;
; Converts the eight signed bytes at src to float and scales them:
;     dst[i] = (float)src[i] * ((float)scale * (1.0f / 16)),   i = 0..7
;
; In:    dst   - output, 8 floats; written with mova, so it has to be
;                16-byte aligned
;        src   - input, 8 signed bytes; no alignment needed
;        scale - integer scale factor
; Uses:  m0-m4 (m3/m4 only in the plain-SSE build), mm0-mm3 in the plain-SSE
;        build (MMX state is cleared with emms before returning)
;
; The macro expands to one function for the instruction set selected by the
; preceding INIT_XMM.
;------------------------------------------------------------------------------
%macro INT8X8_FMUL_INT32 0
cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale
    ; m0 = scale / 16 replicated in all four lanes
    cvtsi2ss    m0, scalem
    mulss       m0, [pf_inv16]
    shufps      m0, m0, 0
%if cpuflag(sse2)
%if cpuflag(sse4)
    ; pmovsxbd loads four bytes and sign-extends them to dwords in one step
    pmovsxbd    m1, [srcq+0]
    pmovsxbd    m2, [srcq+4]
%else
    ; Replicate every byte into all four bytes of its own dword; the
    ; arithmetic shift by 24 then leaves the sign-extended byte value.
    movq        m1, [srcq]
    punpcklbw   m1, m1
    mova        m2, m1
    punpcklwd   m1, m1              ; dwords built from src[0..3]
    punpckhwd   m2, m2              ; dwords built from src[4..7]
    psrad       m1, 24
    psrad       m2, 24
%endif
    cvtdq2ps    m1, m1
    cvtdq2ps    m2, m2
%else
    ; Plain SSE: the same replicate-and-shift widening is done in MMX
    ; registers, two dwords at a time.
    movd       mm0, [srcq+0]
    movd       mm1, [srcq+4]
    punpcklbw  mm0, mm0
    punpcklbw  mm1, mm1
    movq       mm2, mm0
    movq       mm3, mm1
    punpcklwd  mm0, mm0             ; src[0..1]
    punpcklwd  mm1, mm1             ; src[4..5]
    punpckhwd  mm2, mm2             ; src[2..3]
    punpckhwd  mm3, mm3             ; src[6..7]
    psrad      mm0, 24
    psrad      mm1, 24
    psrad      mm2, 24
    psrad      mm3, 24
    ; cvtpi2ps only writes the low two lanes of its destination
    cvtpi2ps    m1, mm0
    cvtpi2ps    m2, mm1
    cvtpi2ps    m3, mm2
    cvtpi2ps    m4, mm3
    ; m0 still holds the broadcast factor from above, so it is not
    ; shuffled a second time here.
    emms
    shufps      m1, m3, q1010       ; m1 = src[0..3] as floats
    shufps      m2, m4, q1010       ; m2 = src[4..7] as floats
%endif
    mulps       m1, m0
    mulps       m2, m0
    mova [dstq+ 0], m1
    mova [dstq+16], m2
    ; straight-line code: the return is neither a branch target nor preceded
    ; by a conditional branch, so the rep-prefixed form is not needed
    RET
%endmacro
80
; Emit one int8x8_fmul_int32 variant per supported instruction set.

; The plain-SSE variant (MMX-based widening) is only assembled for 32-bit
; builds -- presumably because x86-64 always provides SSE2.
%if ARCH_X86_32
INIT_XMM sse
INT8X8_FMUL_INT32
%endif

; SSE2 variant: widening done with unpack + arithmetic shift on XMM registers
INIT_XMM sse2
INT8X8_FMUL_INT32

; SSE4 variant: widening done with pmovsxbd
INIT_XMM sse4
INT8X8_FMUL_INT32