x86: dcadsp: implement SSE lfe_fir
libavcodec/x86/dcadsp.asm
;******************************************************************************
;* SSE-optimized functions for the DCA decoder
;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pf_inv16:  times 4 dd 0x3D800000 ; 1/16

SECTION_TEXT

; void int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale)
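;
; Roughly, this computes (scalar C sketch of the reference behaviour, the
; actual fallback being the decoder's C implementation):
;     float fscale = scale / 16.0f;
;     for (int i = 0; i < 8; i++)
;         dst[i] = src[i] * fscale;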
%macro INT8X8_FMUL_INT32 0
cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale
    cvtsi2ss    m0, scalem
    mulss       m0, [pf_inv16]      ; m0 = scale / 16
    shufps      m0, m0, 0           ; broadcast to all four lanes
%if cpuflag(sse2)
%if cpuflag(sse4)
    pmovsxbd    m1, [srcq+0]
    pmovsxbd    m2, [srcq+4]
%else
    ; sign-extend 8 bytes into two vectors of four int32s
    movq        m1, [srcq]
    punpcklbw   m1, m1
    mova        m2, m1
    punpcklwd   m1, m1
    punpckhwd   m2, m2
    psrad       m1, 24
    psrad       m2, 24
%endif
    cvtdq2ps    m1, m1
    cvtdq2ps    m2, m2
%else
    ; SSE1 path: sign-extend in MMX registers, convert pairwise to floats
    movd       mm0, [srcq+0]
    movd       mm1, [srcq+4]
    punpcklbw  mm0, mm0
    punpcklbw  mm1, mm1
    movq       mm2, mm0
    movq       mm3, mm1
    punpcklwd  mm0, mm0
    punpcklwd  mm1, mm1
    punpckhwd  mm2, mm2
    punpckhwd  mm3, mm3
    psrad      mm0, 24
    psrad      mm1, 24
    psrad      mm2, 24
    psrad      mm3, 24
    cvtpi2ps    m1, mm0
    cvtpi2ps    m2, mm1
    cvtpi2ps    m3, mm2
    cvtpi2ps    m4, mm3
    shufps      m0, m0, 0
    emms
    shufps      m1, m3, q1010
    shufps      m2, m4, q1010
%endif
    mulps       m1, m0
    mulps       m2, m0
    mova [dstq+ 0], m1
    mova [dstq+16], m2
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_XMM sse
INT8X8_FMUL_INT32
%endif

INIT_XMM sse2
INT8X8_FMUL_INT32

INIT_XMM sse4
INT8X8_FMUL_INT32

; %1=v0/v1 %2=in1 %3=in2
%macro FIR_LOOP 2-3
.loop%1:
%define va          m1
%define vb          m2
%if %1
%define OFFSET      0
%else
%define OFFSET      NUM_COEF*count
%endif
    ; the coefficient pointer walks forward for v0 and backward for v1
    mova        va, [cf0q + OFFSET]
    mova        vb, [cf0q + OFFSET + 4*NUM_COEF]
%if %0 == 3
    mova        m4, [cf0q + OFFSET + mmsize]
    mova        m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
%endif
    mulps       va, %2
    mulps       vb, %2
%if %0 == 3
    mulps       m4, %3
    mulps       m0, %3
    addps       va, m4
    addps       vb, m0
%endif
    ; va = va1 va2 va3 va4
    ; vb = vb1 vb2 vb3 vb4
%if %1
    SWAP        va, vb
%endif
    mova        m4, va
    unpcklps    va, vb ; va3 vb3 va4 vb4
    unpckhps    m4, vb ; va1 vb1 va2 vb2
    addps       m4, va ; va1+3 vb1+3 va2+4 vb2+4
    movhlps     vb, m4 ; va1+3 vb1+3
    addps       vb, m4 ; va0..4 vb0..4
    movh  [outq + count], vb
%if %1
    sub       cf0q, 8*NUM_COEF
%endif
    add      count, 8
    jl          .loop%1
%endmacro
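
; Each pass of FIR_LOOP produces two adjacent output samples: va and vb
; accumulate the products of the (reversed) input block with two consecutive
; rows of NUM_COEF coefficients, and the unpck/movhlps/addps sequence does the
; horizontal adds so the pair [sum(va), sum(vb)] can be stored with one movh.
; In scalar terms (sketch):
;     out[i]   = sum over j of in[j] * row0[j]
;     out[i+1] = sum over j of in[j] * row1[j]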

; void dca_lfe_fir(float *out, float *in, float *coefs)
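;
; The fir0 (decifactor 32, 8 coefficients per sample) and fir1 (decifactor 64,
; 4 coefficients per sample) variants below both follow the C reference,
; roughly (sketch):
;     float *out2 = out + decifactor;
;     const float *cf0 = coefs, *cf1 = coefs + 256;
;     for (int k = 0; k < decifactor; k++) {
;         float v0 = 0, v1 = 0;
;         for (int j = 0; j < 256 / decifactor; j++) {
;             v0 += in[-j] * *cf0++;
;             v1 += in[-j] * *--cf1;
;         }
;         *out++  = v0;
;         *out2++ = v1;
;     }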
%macro DCA_LFE_FIR 1
cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
%define IN1       m3
%define IN2       m5
%define count     inq
%define NUM_COEF  4*(2-%1)  ; coefficients per output sample: 8 (fir0), 4 (fir1)
%define NUM_OUT   32*(%1+1) ; output samples per half: 32 (fir0), 64 (fir1)

    movu     IN1, [inq + 4 - 1*mmsize]
    shufps   IN1, IN1, q0123
%if %1 == 0
    movu     IN2, [inq + 4 - 2*mmsize]
    shufps   IN2, IN2, q0123
%endif

    mov    count, -4*NUM_OUT
    add     cf0q, 4*NUM_COEF*NUM_OUT
    add     outq, 4*NUM_OUT
    ; compute v0 first
%if %1 == 0
    FIR_LOOP   0, IN1, IN2
%else
    FIR_LOOP   0, IN1
%endif
    shufps   IN1, IN1, q0123
    mov    count, -4*NUM_OUT
    ; cf1 already correctly positioned
    add     outq, 4*NUM_OUT          ; outq now at out2
    sub     cf0q, 8*NUM_COEF
%if %1 == 0
    shufps   IN2, IN2, q0123
    FIR_LOOP   1, IN2, IN1
%else
    FIR_LOOP   1, IN1
%endif
    RET
%endmacro

INIT_XMM sse
DCA_LFE_FIR 0
DCA_LFE_FIR 1
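
; These entry points are expected to be wired up from the decoder's x86 init
; code; a rough sketch of the assumed shape (not the verbatim init file):
;     if (EXTERNAL_SSE(cpu_flags)) {
;         s->lfe_fir[0] = ff_dca_lfe_fir0_sse;
;         s->lfe_fir[1] = ff_dca_lfe_fir1_sse;
;     }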