;******************************************************************************
;* SSE-optimized functions for the DCA decoder
;* Copyright (C) 2012-2014 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pf_inv16: times 4 dd 0x3D800000 ; 1/16

SECTION_TEXT

; void decode_hf(float dst[DCA_SUBBANDS][8], const int32_t vq_num[DCA_SUBBANDS],
;                const int8_t hf_vq[1024][32], intptr_t vq_offset,
;                int32_t scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end)
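;
; Roughly equivalent scalar C, as a sketch for orientation only (the loop
; variable names are ours, not the decoder's; srcq is pre-offset by vq_offset
; below, so the table lookup is folded into the base pointer):
;~ for (l = start; l < end; l++) {
;~     float fscale = scale[l][0] * (1 / 16.0f);   /* pf_inv16 */
;~     for (i = 0; i < 8; i++)
;~         dst[l][i] = hf_vq[vq_num[l]][i] * fscale;
;~ }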

%macro DECODE_HF 0
cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
    lea         srcq, [srcq + offsetq]
    shl       startq, 2
    mov      offsetd, endm
%define DICT offsetq
    shl      offsetq, 2
    mov         endm, offsetq
.loop:
%if ARCH_X86_64
    mov      offsetd, [scaleq + 2 * startq]
    cvtsi2ss      m0, offsetd
%else
    cvtsi2ss      m0, [scaleq + 2 * startq]
%endif
    mov      offsetd, [numq + startq]
    mulss         m0, [pf_inv16]
    shl         DICT, 5
    shufps        m0, m0, 0
%if cpuflag(sse2)
%if cpuflag(sse4)
    pmovsxbd      m1, [srcq + DICT + 0]
    pmovsxbd      m2, [srcq + DICT + 4]
%else
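    ; no pmovsxbd before SSE4: widen each byte by duplicating it into the
    ; word/dword, then shift arithmetically so the sign bits land in place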
    movq          m1, [srcq + DICT]
    punpcklbw     m1, m1
    mova          m2, m1
    punpcklwd     m1, m1
    punpckhwd     m2, m2
    psrad         m1, 24
    psrad         m2, 24
%endif
    cvtdq2ps      m1, m1
    cvtdq2ps      m2, m2
%else
    movd         mm0, [srcq + DICT + 0]
    movd         mm1, [srcq + DICT + 4]
    punpcklbw    mm0, mm0
    punpcklbw    mm1, mm1
    movq         mm2, mm0
    movq         mm3, mm1
    punpcklwd    mm0, mm0
    punpcklwd    mm1, mm1
    punpckhwd    mm2, mm2
    punpckhwd    mm3, mm3
    psrad        mm0, 24
    psrad        mm1, 24
    psrad        mm2, 24
    psrad        mm3, 24
    cvtpi2ps      m1, mm0
    cvtpi2ps      m2, mm1
    cvtpi2ps      m3, mm2
    cvtpi2ps      m4, mm3
    shufps        m0, m0, 0
    shufps        m1, m3, q1010
    shufps        m2, m4, q1010
%endif
    mulps         m1, m0
    mulps         m2, m0
    mova [dstq + 8 * startq +  0], m1
    mova [dstq + 8 * startq + 16], m2
    add       startq, 4
    cmp       startq, endm
    jl .loop
.end:
%if notcpuflag(sse2)
    emms
%endif
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_XMM sse
DECODE_HF
%endif

INIT_XMM sse2
DECODE_HF

INIT_XMM sse4
DECODE_HF

; %1=v0/v1 %2=in1 %3=in2
%macro FIR_LOOP 2-3
.loop%1:
%define va m1
%define vb m2
%if %1
%define OFFSET 0
%else
%define OFFSET NUM_COEF*count
%endif
    ; v0 walks the coefficient rows forwards, v1 walks them backwards
    mova          va, [cf0q + OFFSET]
    mova          vb, [cf0q + OFFSET + 4*NUM_COEF]
%if %0 == 3
    mova          m4, [cf0q + OFFSET + mmsize]
    mova          m0, [cf0q + OFFSET + 4*NUM_COEF + mmsize]
%endif
    mulps         va, %2
    mulps         vb, %2
%if %0 == 3
    mulps         m4, %3
    mulps         m0, %3
    addps         va, m4
    addps         vb, m0
%endif
    ; va = va1 va2 va3 va4
    ; vb = vb1 vb2 vb3 vb4
%if %1
    SWAP          va, vb
%endif
    mova          m4, va
    unpcklps      va, vb ; va1 vb1 va2 vb2
    unpckhps      m4, vb ; va3 vb3 va4 vb4
    addps         m4, va ; va1+3 vb1+3 va2+4 vb2+4
    movhlps       vb, m4 ; va2+4 vb2+4
    addps         vb, m4 ; va1+2+3+4 vb1+2+3+4
    movh [outq + count], vb
%if %1
    sub         cf0q, 8*NUM_COEF
%endif
    add        count, 8
    jl .loop%1
%endmacro

; void dca_lfe_fir(float *out, float *in, float *coefs)
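;
; Data-flow sketch of one instantiation (ours, not the reference C; in_rev
; stands for the reversed input samples loaded via the shufps below): the
; same NUM_COEF newest samples are dotted with a different coefficient row
; for each output, forwards for the first half, mirrored for the second.
;~ for (k = 0; k < NUM_OUT; k++) {
;~     float v = 0;
;~     for (j = 0; j < NUM_COEF; j++)
;~         v += in_rev[j] * cf0[k * NUM_COEF + j];
;~     out[k] = v;
;~ }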
%macro DCA_LFE_FIR 1
cglobal dca_lfe_fir%1, 3,3,6-%1, out, in, cf0
%define IN1 m3
%define IN2 m5
%define count inq
%define NUM_COEF 4*(2-%1)
%define NUM_OUT  32*(%1+1)
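; %1 == 0: 8 taps per output, 2*32 outputs; %1 == 1: 4 taps, 2*64 outputs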

    movu         IN1, [inq + 4 - 1*mmsize]
    shufps       IN1, IN1, q0123
%if %1 == 0
    movu         IN2, [inq + 4 - 2*mmsize]
    shufps       IN2, IN2, q0123
%endif

    mov        count, -4*NUM_OUT
    add         cf0q, 4*NUM_COEF*NUM_OUT
    add         outq, 4*NUM_OUT
    ; compute v0 first
%if %1 == 0
    FIR_LOOP   0, IN1, IN2
%else
    FIR_LOOP   0, IN1
%endif
    shufps       IN1, IN1, q0123
    mov        count, -4*NUM_OUT
    ; cf1 already correctly positioned
    add         outq, 4*NUM_OUT ; outq now at out2
    sub         cf0q, 8*NUM_COEF
%if %1 == 0
    shufps       IN2, IN2, q0123
    FIR_LOOP   1, IN2, IN1
%else
    FIR_LOOP   1, IN1
%endif
    RET
%endmacro

INIT_XMM sse
DCA_LFE_FIR 0
DCA_LFE_FIR 1

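; SETZERO: zero a register. pxor on an XMM register needs SSE2, and a 256-bit
; pxor would need AVX2, so the plain-SSE and AVX builds use xorps instead.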
%macro SETZERO 1
%if cpuflag(sse2) && notcpuflag(avx)
    pxor          %1, %1
%else
    xorps         %1, %1, %1
%endif
%endmacro

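; SHUF: load one register's worth of floats and reverse their order, for the
; backwards synth_buf reads. The AVX variant must swap the two 128-bit lanes
; first (vperm2f128) because vshufps only shuffles within a lane, and its
; load is offset by -16 so the reversed window covers the same samples.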
%macro SHUF 3
%if cpuflag(avx)
    mova          %3, [%2 - 16]
    vperm2f128    %1, %3, %3, 1
    vshufps       %1, %1, %1, q0123
%elif cpuflag(sse2)
    pshufd        %1, [%2], q0123
%else
    mova          %1, [%2]
    shufps        %1, %1, q0123
%endif
%endmacro

%macro INNER_LOOP 1
    ; reading backwards: ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
    ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
    ;~ b += window[i + j + 16] * ( synth_buf[i + j])
    SHUF          m5, ptr2 + j + (15 - 3) * 4, m6
    mova          m6, [ptr1 + j]
%if ARCH_X86_64
    SHUF         m11, ptr2 + j + (15 - 3) * 4 - mmsize, m12
    mova         m12, [ptr1 + j + mmsize]
%endif
%if cpuflag(fma3)
    fmaddps       m2, m6,  [win + %1 + j + 16 * 4], m2
    fnmaddps      m1, m5,  [win + %1 + j], m1
%if ARCH_X86_64
    fmaddps       m8, m12, [win + %1 + j + mmsize + 16 * 4], m8
    fnmaddps      m7, m11, [win + %1 + j + mmsize], m7
%endif
%else ; non-FMA
    mulps         m6, m6,  [win + %1 + j + 16 * 4]
    mulps         m5, m5,  [win + %1 + j]
%if ARCH_X86_64
    mulps        m12, m12, [win + %1 + j + mmsize + 16 * 4]
    mulps        m11, m11, [win + %1 + j + mmsize]
%endif
    addps         m2, m2, m6
    subps         m1, m1, m5
%if ARCH_X86_64
    addps         m8, m8, m12
    subps         m7, m7, m11
%endif
%endif ; cpuflag(fma3)
    ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
    ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
    SHUF          m6, ptr2 + j + (31 - 3) * 4, m5
    mova          m5, [ptr1 + j + 16 * 4]
%if ARCH_X86_64
    SHUF         m12, ptr2 + j + (31 - 3) * 4 - mmsize, m11
    mova         m11, [ptr1 + j + mmsize + 16 * 4]
%endif
%if cpuflag(fma3)
    fmaddps       m3, m5,  [win + %1 + j + 32 * 4], m3
    fmaddps       m4, m6,  [win + %1 + j + 48 * 4], m4
%if ARCH_X86_64
    fmaddps       m9, m11, [win + %1 + j + mmsize + 32 * 4], m9
    fmaddps      m10, m12, [win + %1 + j + mmsize + 48 * 4], m10
%endif
%else ; non-FMA
    mulps         m5, m5,  [win + %1 + j + 32 * 4]
    mulps         m6, m6,  [win + %1 + j + 48 * 4]
%if ARCH_X86_64
    mulps        m11, m11, [win + %1 + j + mmsize + 32 * 4]
    mulps        m12, m12, [win + %1 + j + mmsize + 48 * 4]
%endif
    addps         m3, m3, m5
    addps         m4, m4, m6
%if ARCH_X86_64
    addps         m9, m9, m11
    addps        m10, m10, m12
%endif
%endif ; cpuflag(fma3)
    sub            j, 64 * 4
%endmacro

; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
;                                  const float window[512], float out[32],
;                                  intptr_t offset, float scale)
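;
; Scalar shape of the function, pieced together from the loads, stores and
; ;~ comments in this file (a sketch for orientation, not the exact C
; reference):
;~ for (i = 0; i < 16; i++) {
;~     float a = synth_buf2[i], b = synth_buf2[i + 16], c = 0, d = 0;
;~     for (j = 0; j < 512; j += 64) {  /* done as two runs, split at offset */
;~         /* a, b, c, d accumulate as in the ;~ lines of INNER_LOOP */
;~     }
;~     out[i]             = a * scale;
;~     out[i + 16]        = b * scale;
;~     synth_buf2[i]      = c;
;~     synth_buf2[i + 16] = d;
;~ }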
%macro SYNTH_FILTER 0
cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
                            synth_buf, synth_buf2, window, out, off, scale
%define scale m0
%if ARCH_X86_32 || WIN64
%if cpuflag(sse2) && notcpuflag(avx)
    movd       scale, scalem
    SPLATD        m0
%else
    VBROADCASTSS  m0, scalem
%endif
    ; Make sure offset is in a register and not on the stack
%define OFFQ r4q
%else
    SPLATD      xmm0
%if cpuflag(avx)
    vinsertf128   m0, m0, xmm0, 1
%endif
%define OFFQ offq
%endif
    ; prepare inner counter limit 1
    mov          r5q, 480
    sub          r5q, offmp
    and          r5q, -64
    shl          r5q, 2
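    ; r5q = ((480 - offset) & ~63) * 4: byte offset of the last 64-float
    ; block handled by the first inner-loop run over synth_buf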
%if ARCH_X86_32 || notcpuflag(avx)
    mov         OFFQ, r5q
%define i r5q
    mov            i, 16 * 4 - (ARCH_X86_64 + 1) * mmsize ; main loop counter
%else
%define i 0
%define OFFQ r5q
%endif

%define buf2 synth_buf2q
%if ARCH_X86_32
    mov         buf2, synth_buf2mp
%endif
.mainloop:
    ; m1 = a  m2 = b  m3 = c  m4 = d
    SETZERO       m3
    SETZERO       m4
    mova          m1, [buf2 + i]
    mova          m2, [buf2 + i + 16 * 4]
%if ARCH_X86_32
%define ptr1 r0q
%define ptr2 r1q
%define win  r2q
%define j    r3q
    mov          win, windowm
    mov         ptr1, synth_bufm
%if ARCH_X86_32 || notcpuflag(avx)
    add          win, i
    add         ptr1, i
%endif
%else ; ARCH_X86_64
%define ptr1 r6q
%define ptr2 r7q ; must be loaded
%define win  r8q
%define j    r9q
    SETZERO       m9
    SETZERO      m10
    mova          m7, [buf2 + i + mmsize]
    mova          m8, [buf2 + i + mmsize + 16 * 4]
    lea          win, [windowq + i]
    lea         ptr1, [synth_bufq + i]
%endif
    mov         ptr2, synth_bufmp
    ; prepare the inner loop counter
    mov            j, OFFQ
%if ARCH_X86_32 || notcpuflag(avx)
    sub         ptr2, i
%endif
.loop1:
    INNER_LOOP 0
    jge .loop1

    mov            j, 448 * 4
    sub            j, OFFQ
    jz .end
    sub         ptr1, j
    sub         ptr2, j
    add          win, OFFQ ; now at j-64, so define OFFSET
    sub            j, 64 * 4
.loop2:
    INNER_LOOP 64 * 4
    jge .loop2

.end:
%if ARCH_X86_32
    mov         buf2, synth_buf2m ; needed for the next iteration anyway
    mov         outq, outmp       ; outq aliases j and must be restored
%endif
    ;~ out[i]      = a * scale;
    ;~ out[i + 16] = b * scale;
    mulps         m1, m1, scale
    mulps         m2, m2, scale
%if ARCH_X86_64
    mulps         m7, m7, scale
    mulps         m8, m8, scale
%endif
    ;~ synth_buf2[i]      = c;
    ;~ synth_buf2[i + 16] = d;
    mova [buf2 + i +  0 * 4], m3
    mova [buf2 + i + 16 * 4], m4
%if ARCH_X86_64
    mova [buf2 + i +  0 * 4 + mmsize], m9
    mova [buf2 + i + 16 * 4 + mmsize], m10
%endif
    ;~ out[i]      = a;
    ;~ out[i + 16] = b;
    mova [outq + i +  0 * 4], m1
    mova [outq + i + 16 * 4], m2
%if ARCH_X86_64
    mova [outq + i +  0 * 4 + mmsize], m7
    mova [outq + i + 16 * 4 + mmsize], m8
%endif
%if ARCH_X86_32 || notcpuflag(avx)
    sub            i, (ARCH_X86_64 + 1) * mmsize
    jge .mainloop
%endif
    RET
%endmacro

%if ARCH_X86_32
INIT_XMM sse
SYNTH_FILTER
%endif
INIT_XMM sse2
SYNTH_FILTER
INIT_YMM avx
SYNTH_FILTER
INIT_YMM fma3
SYNTH_FILTER