; libavcodec/x86/dcadsp.asm
; (from commit 5b59a9fc "x86/synth_filter: add synth_filter_sse", C. Gisquet)
;******************************************************************************
;* SSE-optimized functions for the DCA decoder
;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; 0x3D800000 is 1/16.0f; broadcast to all four lanes for mulps use
pf_inv16: times 4 dd 0x3D800000 ; 1/16

SECTION_TEXT
; void decode_hf(float dst[DCA_SUBBANDS][8], const int32_t vq_num[DCA_SUBBANDS],
;                const int8_t hf_vq[1024][32], intptr_t vq_offset,
;                int32_t scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end)
;
; For each subband in [start, end): pick the 32-byte VQ dictionary row
; selected by vq_num[], sign-extend the first 8 int8 entries to float and
; scale them by scale[band][0] / 16, storing 8 floats per band into dst.
%macro DECODE_HF 0
cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
    lea         srcq, [srcq + offsetq]  ; srcq = hf_vq + vq_offset
    shl         startq, 2               ; index in 4-byte units
    mov         offsetd, endm
%define DICT offsetq                    ; offsetq is free now; reuse as dict index
    shl         offsetq, 2
    mov         endm, offsetq           ; loop limit, also scaled by 4
.loop:
%if ARCH_X86_64
    mov         offsetd, [scaleq + 2 * startq]
    cvtsi2ss    m0, offsetd
%else
    cvtsi2ss    m0, [scaleq + 2 * startq]
%endif
    mov         offsetd, [numq + startq] ; vq_num[band] -> dictionary row
    mulss       m0, [pf_inv16]           ; scale / 16
    shl         DICT, 5                  ; row * 32 bytes
    shufps      m0, m0, 0                ; broadcast scale to all lanes
%if cpuflag(sse2)
%if cpuflag(sse4)
    ; SSE4.1: direct sign-extending byte->dword loads
    pmovsxbd    m1, [srcq + DICT + 0]
    pmovsxbd    m2, [srcq + DICT + 4]
%else
    ; SSE2: widen by duplicating then arithmetic-shifting the high byte down
    movq        m1, [srcq + DICT]
    punpcklbw   m1, m1
    mova        m2, m1
    punpcklwd   m1, m1
    punpckhwd   m2, m2
    psrad       m1, 24
    psrad       m2, 24
%endif
    cvtdq2ps    m1, m1
    cvtdq2ps    m2, m2
%else
    ; SSE1: do the widening in MMX registers, convert 2 ints at a time
    movd        mm0, [srcq + DICT + 0]
    movd        mm1, [srcq + DICT + 4]
    punpcklbw   mm0, mm0
    punpcklbw   mm1, mm1
    movq        mm2, mm0
    movq        mm3, mm1
    punpcklwd   mm0, mm0
    punpcklwd   mm1, mm1
    punpckhwd   mm2, mm2
    punpckhwd   mm3, mm3
    psrad       mm0, 24
    psrad       mm1, 24
    psrad       mm2, 24
    psrad       mm3, 24
    cvtpi2ps    m1, mm0
    cvtpi2ps    m2, mm1
    cvtpi2ps    m3, mm2
    cvtpi2ps    m4, mm3
    shufps      m0, m0, 0
    shufps      m1, m3, q1010           ; merge low halves into full vectors
    shufps      m2, m4, q1010
%endif
    mulps       m1, m0
    mulps       m2, m0
    mova        [dstq + 8 * startq +  0], m1
    mova        [dstq + 8 * startq + 16], m2
    add         startq, 4
    cmp         startq, endm
    jl          .loop
.end:
%if notcpuflag(sse2)
    emms                                 ; leave MMX state clean for the FPU
%endif
    REP_RET
%endmacro

; SSE1 version only useful on 32-bit (64-bit CPUs always have SSE2)
%if ARCH_X86_32
INIT_XMM sse
DECODE_HF
%endif

INIT_XMM sse2
DECODE_HF

INIT_XMM sse4
DECODE_HF

; %1=v0/v1 %2=in1 %3=in2 (optional)
; One FIR accumulation loop; emits label .loop%1 and iterates while
; count (negative, counting up by 8 bytes = 2 floats) stays below zero.
%macro FIR_LOOP 2-3
.loop%1:
%define va m1
%define vb m2
%if %1
%define OFFSET 0
%else
%define OFFSET NUM_COEF*count
%endif
    ; for v0, incrementing and for v1, decrementing
    mova        va, [cf0q + OFFSET]
    mova        vb, [cf0q + OFFSET + 4*NUM_COEF]
%if %0 == 3
    mova        m4, [cf0q + OFFSET + mmsize]
    mova        m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
%endif
    mulps       va, %2
    mulps       vb, %2
%if %0 == 3
    mulps       m4, %3
    mulps       m0, %3
    addps       va, m4
    addps       vb, m0
%endif
    ; va = va1 va2 va3 va4
    ; vb = vb1 vb2 vb3 vb4
%if %1
    SWAP        va, vb
%endif
    ; horizontal pairwise reduction of va/vb into two scalars
    mova        m4, va
    unpcklps    va, vb                  ; va3 vb3 va4 vb4
    unpckhps    m4, vb                  ; va1 vb1 va2 vb2
    addps       m4, va                  ; va1+3 vb1+3 va2+4 vb2+4
    movhlps     vb, m4                  ; va1+3 vb1+3
    addps       vb, m4                  ; va0..4 vb0..4
    movh        [outq + count], vb      ; store the two output samples
%if %1
    sub         cf0q, 8*NUM_COEF        ; v1 walks the coefficients backwards
%endif
    add         count, 8
    jl          .loop%1
%endmacro

; void dca_lfe_fir(float *out, float *in, float *coefs)
; %1 selects the variant: NUM_COEF = 4*(2-%1) taps, NUM_OUT = 32*(%1+1)
; outputs per half. Computes v0 (forward) then v1 (backward) halves.
%macro DCA_LFE_FIR 1
cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
%define IN1       m3
%define IN2       m5
%define count     inq                   ; inq is reused as the loop counter
%define NUM_COEF  4*(2-%1)
%define NUM_OUT   32*(%1+1)

    ; load the (unaligned) input samples, reversed into lane order 3..0
    movu        IN1, [inq + 4 - 1*mmsize]
    shufps      IN1, IN1, q0123
%if %1 == 0
    movu        IN2, [inq + 4 - 2*mmsize]
    shufps      IN2, IN2, q0123
%endif

    mov         count, -4*NUM_OUT       ; negative byte counter -> 0
    add         cf0q, 4*NUM_COEF*NUM_OUT
    add         outq, 4*NUM_OUT
    ; compute v0 first
%if %1 == 0
    FIR_LOOP    0, IN1, IN2
%else
    FIR_LOOP    0, IN1
%endif
    shufps      IN1, IN1, q0123         ; re-reverse inputs for the v1 pass
    mov         count, -4*NUM_OUT
    ; cf1 already correctly positioned
    add         outq, 4*NUM_OUT         ; outq now at out2
    sub         cf0q, 8*NUM_COEF
%if %1 == 0
    shufps      IN2, IN2, q0123
    FIR_LOOP    1, IN2, IN1
%else
    FIR_LOOP    1, IN1
%endif
    RET
%endmacro

INIT_XMM sse
DCA_LFE_FIR 0
DCA_LFE_FIR 1

; SETZERO %1: clear an xmm register; integer pxor on SSE2+,
; float xorps on plain SSE (avoids an int-domain op there).
%macro SETZERO 1
%if cpuflag(sse2)
    pxor        %1, %1
%else
    xorps       %1, %1, %1
%endif
%endmacro

; SHUF %1, %2: load the four floats at %2 into %1 in reversed lane order.
; SSE2 can reverse straight from memory with pshufd; SSE needs mova+shufps.
%macro SHUF 2
%if cpuflag(sse2)
    pshufd      %1, %2, q0123
%else
    mova        %1, %2
    shufps      %1, %1, q0123
%endif
%endmacro

; INNER_LOOP %1: one step of the synth-filter inner accumulation.
; %1 is an extra byte offset into the window table (0 or 64*4).
; Accumulates into m1..m4 (a,b,c,d) and, on x86_64, also into the
; second column m7..m10; decrements j by 64 floats, leaving flags
; set for the caller's jge.
%macro INNER_LOOP 1
    ; reading backwards: ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
    ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
    ;~ b += window[i + j + 16] * ( synth_buf[i + j])
    SHUF        m5, [ptr2 + j + (15 - 3) * 4]
    mova        m6, [ptr1 + j]
%if ARCH_X86_64
    SHUF        m11, [ptr2 + j + (15 - 3) * 4 - mmsize]
    mova        m12, [ptr1 + j + mmsize]
%endif
    mulps       m6, [win + %1 + j + 16 * 4]
    mulps       m5, [win + %1 + j]
%if ARCH_X86_64
    mulps       m12, [win + %1 + j + mmsize + 16 * 4]
    mulps       m11, [win + %1 + j + mmsize]
%endif
    addps       m2, m6
    subps       m1, m5                  ; subtract implements the minus sign on a
%if ARCH_X86_64
    addps       m8, m12
    subps       m7, m11
%endif
    ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
    ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
    SHUF        m6, [ptr2 + j + (31 - 3) * 4]
    mova        m5, [ptr1 + j + 16 * 4]
%if ARCH_X86_64
    SHUF        m12, [ptr2 + j + (31 - 3) * 4 - mmsize]
    mova        m11, [ptr1 + j + mmsize + 16 * 4]
%endif
    mulps       m5, [win + %1 + j + 32 * 4]
    mulps       m6, [win + %1 + j + 48 * 4]
%if ARCH_X86_64
    mulps       m11, [win + %1 + j + mmsize + 32 * 4]
    mulps       m12, [win + %1 + j + mmsize + 48 * 4]
%endif
    addps       m3, m5
    addps       m4, m6
%if ARCH_X86_64
    addps       m9, m11
    addps       m10, m12
%endif
    sub         j, 64 * 4
%endmacro

; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
;                                  const float window[512], float out[32],
;                                  intptr_t offset, float scale)
%macro SYNTH_FILTER 0
cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
                            synth_buf, synth_buf2, window, out, off, scale
%define scale m0
%if ARCH_X86_32 || WIN64
%if cpuflag(sse2)
    movd        scale, scalem           ; 4-byte load of the float argument
%else
    movss       scale, scalem
%endif
; Make sure offset is in a register and not on the stack
%define OFFQ  r4q
%else
%define OFFQ  offq
%endif
    SPLATD      m0                      ; broadcast scale to all four lanes
    ; prepare inner counter limit 1
    mov         r5q, 480
    sub         r5q, offmp
    and         r5q, -64
    shl         r5q, 2
    mov         OFFQ, r5q
%define i r5q
    mov         i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter

%define buf2 synth_buf2q
%if ARCH_X86_32
    mov         buf2, synth_buf2mp
%endif
.mainloop:
    ; m1 = a  m2 = b  m3 = c  m4 = d
    SETZERO     m3
    SETZERO     m4
    mova        m1, [buf2 + i]
    mova        m2, [buf2 + i + 16 * 4]
%if ARCH_X86_32
%define ptr1 r0q
%define ptr2 r1q
%define win  r2q
%define j    r3q
    mov         win, windowm
    mov         ptr1, synth_bufm
    add         win, i
    add         ptr1, i
%else ; ARCH_X86_64
%define ptr1 r6q
%define ptr2 r7q ; must be loaded
%define win  r8q
%define j    r9q
    SETZERO     m9
    SETZERO     m10
    mova        m7, [buf2 + i + mmsize]
    mova        m8, [buf2 + i + mmsize + 16 * 4]
    lea         win, [windowq + i]
    lea         ptr1, [synth_bufq + i]
%endif
    mov         ptr2, synth_bufmp
    ; prepare the inner loop counter
    mov         j, OFFQ
    sub         ptr2, i
.loop1:
    INNER_LOOP  0
    jge         .loop1                  ; INNER_LOOP's final sub sets the flags

    mov         j, 448 * 4
    sub         j, OFFQ
    jz          .end
    sub         ptr1, j
    sub         ptr2, j
    add         win, OFFQ               ; now at j-64, so define OFFSET
    sub         j, 64 * 4
.loop2:
    INNER_LOOP  64 * 4
    jge         .loop2

.end:
%if ARCH_X86_32
    mov         buf2, synth_buf2m       ; needed for next iteration anyway
    mov         outq, outmp             ; j, which will be set again during it
%endif
    ;~ out[i]      = a * scale;
    ;~ out[i + 16] = b * scale;
    mulps       m1, scale
    mulps       m2, scale
%if ARCH_X86_64
    mulps       m7, scale
    mulps       m8, scale
%endif
    ;~ synth_buf2[i]      = c;
    ;~ synth_buf2[i + 16] = d;
    mova        [buf2 + i +  0 * 4], m3
    mova        [buf2 + i + 16 * 4], m4
%if ARCH_X86_64
    mova        [buf2 + i +  0 * 4 + mmsize], m9
    mova        [buf2 + i + 16 * 4 + mmsize], m10
%endif
    ;~ out[i]      = a;
    ;~ out[i + 16] = b;
    mova        [outq + i +  0 * 4], m1
    mova        [outq + i + 16 * 4], m2
%if ARCH_X86_64
    mova        [outq + i +  0 * 4 + mmsize], m7
    mova        [outq + i + 16 * 4 + mmsize], m8
%endif
    sub         i, (ARCH_X86_64 + 1) * mmsize
    jge         .mainloop
    RET
%endmacro

; SSE1 fallback is only needed where SSE2 cannot be assumed (32-bit)
%if ARCH_X86_32
INIT_XMM sse
SYNTH_FILTER
%endif
INIT_XMM sse2
SYNTH_FILTER