x86: xvid: K&R formatting cosmetics
[libav.git] / libavcodec / x86 / xvididct_sse2.c
CommitLineData
f73a6393
AS
1/*
2 * XVID MPEG-4 VIDEO CODEC
3 * - SSE2 inverse discrete cosine transform -
4 *
5 * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
6 *
7 * Conversion to gcc syntax with modifications
8 * by Alexander Strange <astrange@ithinksw.com>
9 *
10 * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
11 *
2912e87a 12 * This file is part of Libav.
f73a6393
AS
13 *
14 * Vertical pass is an implementation of the scheme:
15 * Loeffler C., Ligtenberg A., and Moschytz C.S.:
16 * Practical Fast 1D DCT Algorithm with Eleven Multiplications,
17 * Proc. ICASSP 1989, 988-991.
18 *
19 * Horizontal pass is a double 4x4 vector/matrix multiplication,
20 * (see also Intel's Application Note 922:
21 * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
22 * Copyright (C) 1999 Intel Corporation)
23 *
24 * More details at http://skal.planet-d.net/coding/dct.html
25 *
2912e87a 26 * Libav is free software; you can redistribute it and/or
f73a6393
AS
27 * modify it under the terms of the GNU Lesser General Public
28 * License as published by the Free Software Foundation; either
29 * version 2.1 of the License, or (at your option) any later version.
30 *
2912e87a 31 * Libav is distributed in the hope that it will be useful,
f73a6393
AS
32 * but WITHOUT ANY WARRANTY; without even the implied warranty of
33 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
34 * Lesser General Public License for more details.
35 *
36 * You should have received a copy of the GNU Lesser General Public License
2912e87a 37 * along with Libav; if not, write to the Free Software Foundation,
f73a6393
AS
38 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
39 */
40
1d9c2dc8
MS
41#include "libavutil/internal.h"
42#include "libavutil/mem.h"
c318626c 43#include "libavutil/x86/asm.h"
dcb7c868 44
e3fcb143 45#include "idctdsp.h"
dcb7c868 46#include "xvididct.h"
f73a6393 47
0b8b2ae5 48#if HAVE_SSE2_INLINE
79195ce5 49
adbfc605 50/**
ba87f080 51 * @file
d35b94fb 52 * @brief SSE2 IDCT compatible with the Xvid IDCT
f73a6393
AS
53 */
54
/* Expands to its argument repeated eight times, comma-separated; used below
 * to fill eight-element initializers with a single value. */
8d27bf1c 55#define X8(x) x, x, x, x, x, x, x, x
f73a6393 56
8d27bf1c
DB
/* Multipliers for the column pass (iLLM_PASS / iLLM_PASS_SPARSE), eight
 * identical 16-bit words each. The numeric values equal the quantity in the
 * trailing comment scaled by 65536 (e.g. 13036 / 65536 ~= tan(pi/16)); they
 * are consumed by pmulhw in the column-pass macros.
 * tan3: 43790 does not fit a signed 16-bit word; read as signed it is
 * 43790 - 65536, which is why its comment says "tan(3pi/16)-1". The column
 * pass adds the multiplicand back with paddsw right after the pmulhw. */
57DECLARE_ASM_CONST(16, int16_t, tan1)[] = { X8(13036) }; // tan( pi/16)
58DECLARE_ASM_CONST(16, int16_t, tan2)[] = { X8(27146) }; // tan(2pi/16) = sqrt(2)-1
59DECLARE_ASM_CONST(16, int16_t, tan3)[] = { X8(43790) }; // tan(3pi/16)-1
60DECLARE_ASM_CONST(16, int16_t, sqrt2)[] = { X8(23170) }; // 0.5/sqrt(2)
/* 127 in each of eight bytes. ff_xvid_idct_sse2 loads it into mm0, and
 * TEST_ONE_ROW / TEST_TWO_ROWS add it with unsigned byte saturation so that
 * any non-zero byte ends up with its top bit set. */
61DECLARE_ASM_CONST(8, uint8_t, m127)[] = { X8(127) };
f73a6393 62
/* Row-pass coefficient table: 32 16-bit words, i.e. four 16-byte groups.
 * iMTX_MULT reads the groups at byte offsets 0, 16, 32 and 48 as pmaddwd
 * operands. ff_xvid_idct_sse2 passes this table for rows 0 and 4. */
c6727809 63DECLARE_ASM_CONST(16, int16_t, iTab1)[] = {
8d27bf1c
DB
64 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
65 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
66 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
67 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
f73a6393
AS
68};
69
/* Row-pass coefficient table: 32 16-bit words, i.e. four 16-byte groups.
 * iMTX_MULT reads the groups at byte offsets 0, 16, 32 and 48 as pmaddwd
 * operands. ff_xvid_idct_sse2 passes this table for rows 1 and 7. */
c6727809 70DECLARE_ASM_CONST(16, int16_t, iTab2)[] = {
8d27bf1c
DB
71 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
72 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
73 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
74 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
f73a6393
AS
75};
76
/* Row-pass coefficient table: 32 16-bit words, i.e. four 16-byte groups.
 * iMTX_MULT reads the groups at byte offsets 0, 16, 32 and 48 as pmaddwd
 * operands. ff_xvid_idct_sse2 passes this table for rows 2 and 6. */
c6727809 77DECLARE_ASM_CONST(16, int16_t, iTab3)[] = {
8d27bf1c
DB
78 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
79 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
80 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
81 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
f73a6393
AS
82};
83
/* Row-pass coefficient table: 32 16-bit words, i.e. four 16-byte groups.
 * iMTX_MULT reads the groups at byte offsets 0, 16, 32 and 48 as pmaddwd
 * operands. ff_xvid_idct_sse2 passes this table for rows 3 and 5. */
c6727809 84DECLARE_ASM_CONST(16, int16_t, iTab4)[] = {
8d27bf1c
DB
85 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
86 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
87 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
88 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
f73a6393
AS
89};
90
/* Rounding constants for the row pass: six groups of four identical 32-bit
 * values, one group per 16 bytes. ff_xvid_idct_sse2 selects a group with
 * ROUND(walkenIdctRounders + k * 16) and iMTX_MULT adds it (paddd) before the
 * final shift. As used there: group 0 -> row 0, 1 -> row 1, 2 -> row 2,
 * 3 -> row 3, 4 -> row 5, 5 -> rows 6 and 7. Row 4 is given "#" instead of a
 * rounder, so no group is referenced for it.
 * NOTE(review): the "+ k * 16" is presumably a byte offset resolved by the
 * assembler after MANGLE() turns the expression into operand text - confirm
 * against the MANGLE definition. */
c6727809 91DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
8d27bf1c
DB
92 65536, 65536, 65536, 65536,
93 3597, 3597, 3597, 3597,
94 2260, 2260, 2260, 2260,
95 1203, 1203, 1203, 1203,
96 120, 120, 120, 120,
97 512, 512, 512, 512
f73a6393
AS
98};
99
100// Temporary storage before the column pass
/* The odd rows produced by the row pass are held in these xmm registers on
 * every architecture; the column-pass macros read them directly as
 * %%xmm4..%%xmm7. */
101#define ROW1 "%%xmm6"
102#define ROW3 "%%xmm4"
103#define ROW5 "%%xmm5"
104#define ROW7 "%%xmm7"
105
/* CLEAR_ODD(r): zero register r by xor-ing it with itself. */
106#define CLEAR_ODD(r) "pxor "r","r" \n\t"
/* PUT_ODD(dst): copy %%xmm2 (where iMTX_MULT leaves its packed result) to
 * dst with the four upper 16-bit words in reversed order (shuffle immediate
 * 0x1B); the lower four words are copied unchanged. */
107#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"
108
/* Architecture-dependent placement of the even rows, scratch register and
 * tangent constants used by the row and column passes. */
b250f9c6 109#if ARCH_X86_64
f73a6393
AS
110
/* x86-64 branch: even rows stay in %%xmm8..%%xmm11, so each REGn is simply
 * its ROWn and the EVEN helpers are the same as the ODD ones. XMMS is a
 * dedicated scratch register and TAN3/TAN1 have registers of their own.
 * MOV_32_ONLY is "#": presumably this makes the assembler treat the rest of
 * the emitted line as a comment so the 32-bit-only moves vanish - confirm
 * for the target assembler. */
111# define ROW0 "%%xmm8"
112# define REG0 ROW0
113# define ROW2 "%%xmm9"
114# define REG2 ROW2
115# define ROW4 "%%xmm10"
116# define REG4 ROW4
117# define ROW6 "%%xmm11"
118# define REG6 ROW6
119# define CLEAR_EVEN(r) CLEAR_ODD(r)
120# define PUT_EVEN(dst) PUT_ODD(dst)
121# define XMMS "%%xmm12"
122# define MOV_32_ONLY "#"
123# define SREG2 REG2
124# define TAN3 "%%xmm13"
125# define TAN1 "%%xmm14"
126
127#else
128
/* Other branch: even rows are memory operands relative to asm operand %0
 * (row n at n*16 bytes). REG0/REG2 share %%xmm4 and REG4/REG6 share %%xmm6;
 * MOV_32_ONLY is a real "movdqa " that loads them on demand. CLEAR_EVEN
 * expands to nothing, so a skipped even row is left as it is in memory.
 * PUT_EVEN reorders %%xmm2 in place and stores it to dst. XMMS and TAN1 both
 * name %%xmm2, which is why the column-pass macros spill TAN1 to (%0) while
 * XMMS is in use. */
129# define ROW0 "(%0)"
130# define REG0 "%%xmm4"
131# define ROW2 "2*16(%0)"
132# define REG2 "%%xmm4"
133# define ROW4 "4*16(%0)"
134# define REG4 "%%xmm6"
135# define ROW6 "6*16(%0)"
136# define REG6 "%%xmm6"
137# define CLEAR_EVEN(r)
138# define PUT_EVEN(dst) \
139 "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
140 "movdqa %%xmm2, "dst" \n\t"
141# define XMMS "%%xmm2"
142# define MOV_32_ONLY "movdqa "
143# define SREG2 "%%xmm7"
144# define TAN3 "%%xmm0"
145# define TAN1 "%%xmm2"
146
147#endif
148
/* ROUND(x): the text "paddd " followed by MANGLE(x). It is only the first
 * half of an instruction; iMTX_MULT appends the destination ", %%xmm0". */
149#define ROUND(x) "paddd "MANGLE(x)
150
/* JZ(reg, to): test the 32-bit register reg against itself and jump to
 * label `to` when it is zero. */
151#define JZ(reg, to) \
152 "testl "reg","reg" \n\t" \
153 "jz "to" \n\t"
154
/* JNZ(reg, to): test the 32-bit register reg against itself and jump to
 * label `to` when it is non-zero. */
155#define JNZ(reg, to) \
156 "testl "reg","reg" \n\t" \
157 "jnz "to" \n\t"
158
/* TEST_ONE_ROW(src, reg, clear): emit `clear`, then OR the two 8-byte halves
 * of the 16-byte row at src into %%mm1, add %%mm0 to it with unsigned byte
 * saturation and collect the top bit of every byte into reg (pmovmskb).
 * With %%mm0 holding 127 in each byte (as ff_xvid_idct_sse2 sets it up), reg
 * is zero exactly when every byte of the row is zero. Overwrites %%mm1. */
159#define TEST_ONE_ROW(src, reg, clear) \
160 clear \
161 "movq "src", %%mm1 \n\t" \
162 "por 8+"src", %%mm1 \n\t" \
163 "paddusb %%mm0, %%mm1 \n\t" \
164 "pmovmskb %%mm1, "reg" \n\t"
165
/* TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2): two-row form of
 * TEST_ONE_ROW. Emits clear1 and clear2, ORs the two 8-byte halves of row1
 * into %%mm1 and of row2 into %%mm2, adds %%mm0 to both with unsigned byte
 * saturation and writes the per-byte top-bit masks to reg1 and reg2.
 * With %%mm0 holding 127 in each byte, a register ends up zero exactly when
 * its row is all zero. Overwrites %%mm1 and %%mm2. */
166#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
8d27bf1c
DB
167 clear1 \
168 clear2 \
169 "movq "row1", %%mm1 \n\t" \
170 "por 8+"row1", %%mm1 \n\t" \
171 "movq "row2", %%mm2 \n\t" \
172 "por 8+"row2", %%mm2 \n\t" \
173 "paddusb %%mm0, %%mm1 \n\t" \
174 "paddusb %%mm0, %%mm2 \n\t" \
175 "pmovmskb %%mm1, "reg1" \n\t" \
f73a6393
AS
176 "pmovmskb %%mm2, "reg2" \n\t"
177
8d27bf1c
DB
178/// IDCT pass on rows.
/* iMTX_MULT(src, table, rounder, put): transform one 16-byte row.
 *   src     - memory operand of the input row (eight 16-bit words)
 *   table   - coefficient table; its four 16-byte groups at offsets 0, 16,
 *             32 and 48 are the pmaddwd operands
 *   rounder - instruction text completed here with ", %%xmm0" (a ROUND(...)
 *             expansion, or "#" as passed for row 4)
 *   put     - code emitted after the result is packed into %%xmm2
 *             (PUT_EVEN / PUT_ODD in this file)
 * The row is rearranged into four shuffled copies, each multiplied against
 * one table group; the pairs of products are summed, the rounder is applied
 * to one sum, then the sum and difference of the two halves are shifted
 * right arithmetically by 11 and packed with signed saturation into %%xmm2.
 * Overwrites %%xmm0-%%xmm3.
 * The macro ends with local label "1:", so a JZ(..., "1f") placed before an
 * invocation skips that whole row. */
179#define iMTX_MULT(src, table, rounder, put) \
180 "movdqa "src", %%xmm3 \n\t" \
181 "movdqa %%xmm3, %%xmm0 \n\t" \
f73a6393
AS
182 "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \
183 "punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \
8d27bf1c
DB
184 "pmaddwd "table", %%xmm0 \n\t" \
185 "pmaddwd 16+"table", %%xmm1 \n\t" \
f73a6393
AS
186 "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \
187 "punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \
8d27bf1c
DB
188 "pmaddwd 32+"table", %%xmm2 \n\t" \
189 "pmaddwd 48+"table", %%xmm3 \n\t" \
190 "paddd %%xmm1, %%xmm0 \n\t" \
191 "paddd %%xmm3, %%xmm2 \n\t" \
192 rounder", %%xmm0 \n\t" \
193 "movdqa %%xmm2, %%xmm3 \n\t" \
194 "paddd %%xmm0, %%xmm2 \n\t" \
195 "psubd %%xmm3, %%xmm0 \n\t" \
196 "psrad $11, %%xmm2 \n\t" \
197 "psrad $11, %%xmm0 \n\t" \
198 "packssdw %%xmm0, %%xmm2 \n\t" \
199 put \
f73a6393
AS
200 "1: \n\t"
201
/* iLLM_HEAD: load the tan3 and tan1 constant vectors into the TAN3 and TAN1
 * registers, as expected on entry by iLLM_PASS and iLLM_PASS_SPARSE. */
202#define iLLM_HEAD \
203 "movdqa "MANGLE(tan3)", "TAN3" \n\t" \
204 "movdqa "MANGLE(tan1)", "TAN1" \n\t" \
205
8d27bf1c 206/// IDCT pass on columns.
f73a6393
AS
/* iLLM_PASS(dct): full column pass over all eight rows.
 * On entry: TAN3 and TAN1 hold the tan3/tan1 vectors (see iLLM_HEAD), the
 * odd rows are in %%xmm4 (ROW3), %%xmm5 (ROW5), %%xmm6 (ROW1) and %%xmm7
 * (ROW7), and the even rows are wherever ROW0/ROW2/ROW4/ROW6 place them.
 * Where MOV_32_ONLY is a real move, the even rows are loaded from memory as
 * they are needed and TAN1 is parked at (%0) while XMMS is in use, then
 * reloaded. Those spill/reload lines and the ROWn operands name %0 directly,
 * not `dct`; the only caller in this file passes "%0" for dct.
 * All additions and subtractions saturate (paddsw/psubsw); multiplications
 * use pmulhw with the tan1/tan2/tan3/sqrt2 constants. Every result is
 * shifted right arithmetically by 6 and stored with movdqa to rows 0..7 of
 * dct (16 bytes per row). */
207#define iLLM_PASS(dct) \
208 "movdqa "TAN3", %%xmm1 \n\t" \
209 "movdqa "TAN1", %%xmm3 \n\t" \
210 "pmulhw %%xmm4, "TAN3" \n\t" \
211 "pmulhw %%xmm5, %%xmm1 \n\t" \
212 "paddsw %%xmm4, "TAN3" \n\t" \
213 "paddsw %%xmm5, %%xmm1 \n\t" \
214 "psubsw %%xmm5, "TAN3" \n\t" \
215 "paddsw %%xmm4, %%xmm1 \n\t" \
216 "pmulhw %%xmm7, %%xmm3 \n\t" \
217 "pmulhw %%xmm6, "TAN1" \n\t" \
218 "paddsw %%xmm6, %%xmm3 \n\t" \
219 "psubsw %%xmm7, "TAN1" \n\t" \
220 "movdqa %%xmm3, %%xmm7 \n\t" \
221 "movdqa "TAN1", %%xmm6 \n\t" \
222 "psubsw %%xmm1, %%xmm3 \n\t" \
223 "psubsw "TAN3", "TAN1" \n\t" \
224 "paddsw %%xmm7, %%xmm1 \n\t" \
225 "paddsw %%xmm6, "TAN3" \n\t" \
226 "movdqa %%xmm3, %%xmm6 \n\t" \
227 "psubsw "TAN3", %%xmm3 \n\t" \
228 "paddsw %%xmm6, "TAN3" \n\t" \
229 "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
230 "pmulhw %%xmm4, %%xmm3 \n\t" \
231 "pmulhw %%xmm4, "TAN3" \n\t" \
232 "paddsw "TAN3", "TAN3" \n\t" \
233 "paddsw %%xmm3, %%xmm3 \n\t" \
234 "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
235 MOV_32_ONLY ROW2", "REG2" \n\t" \
236 MOV_32_ONLY ROW6", "REG6" \n\t" \
237 "movdqa %%xmm7, %%xmm5 \n\t" \
238 "pmulhw "REG6", %%xmm7 \n\t" \
239 "pmulhw "REG2", %%xmm5 \n\t" \
240 "paddsw "REG2", %%xmm7 \n\t" \
241 "psubsw "REG6", %%xmm5 \n\t" \
242 MOV_32_ONLY ROW0", "REG0" \n\t" \
243 MOV_32_ONLY ROW4", "REG4" \n\t" \
244 MOV_32_ONLY" "TAN1", (%0) \n\t" \
245 "movdqa "REG0", "XMMS" \n\t" \
246 "psubsw "REG4", "REG0" \n\t" \
247 "paddsw "XMMS", "REG4" \n\t" \
248 "movdqa "REG4", "XMMS" \n\t" \
249 "psubsw %%xmm7, "REG4" \n\t" \
250 "paddsw "XMMS", %%xmm7 \n\t" \
251 "movdqa "REG0", "XMMS" \n\t" \
252 "psubsw %%xmm5, "REG0" \n\t" \
253 "paddsw "XMMS", %%xmm5 \n\t" \
254 "movdqa %%xmm5, "XMMS" \n\t" \
255 "psubsw "TAN3", %%xmm5 \n\t" \
256 "paddsw "XMMS", "TAN3" \n\t" \
257 "movdqa "REG0", "XMMS" \n\t" \
258 "psubsw %%xmm3, "REG0" \n\t" \
259 "paddsw "XMMS", %%xmm3 \n\t" \
260 MOV_32_ONLY" (%0), "TAN1" \n\t" \
261 "psraw $6, %%xmm5 \n\t" \
262 "psraw $6, "REG0" \n\t" \
263 "psraw $6, "TAN3" \n\t" \
264 "psraw $6, %%xmm3 \n\t" \
265 "movdqa "TAN3", 1*16("dct") \n\t" \
266 "movdqa %%xmm3, 2*16("dct") \n\t" \
267 "movdqa "REG0", 5*16("dct") \n\t" \
268 "movdqa %%xmm5, 6*16("dct") \n\t" \
269 "movdqa %%xmm7, %%xmm0 \n\t" \
270 "movdqa "REG4", %%xmm4 \n\t" \
271 "psubsw %%xmm1, %%xmm7 \n\t" \
272 "psubsw "TAN1", "REG4" \n\t" \
273 "paddsw %%xmm0, %%xmm1 \n\t" \
274 "paddsw %%xmm4, "TAN1" \n\t" \
275 "psraw $6, %%xmm1 \n\t" \
276 "psraw $6, %%xmm7 \n\t" \
277 "psraw $6, "TAN1" \n\t" \
278 "psraw $6, "REG4" \n\t" \
279 "movdqa %%xmm1, ("dct") \n\t" \
280 "movdqa "TAN1", 3*16("dct") \n\t" \
281 "movdqa "REG4", 4*16("dct") \n\t" \
282 "movdqa %%xmm7, 7*16("dct") \n\t"
283
8d27bf1c 284/// IDCT pass on columns, assuming rows 4-7 are zero.
f73a6393
AS
/* iLLM_PASS_SPARSE(dct): reduced column pass. It reads only %%xmm4 (ROW3),
 * %%xmm6 (ROW1), ROW0 and ROW2; ROW4..ROW7 are never referenced, which is
 * what makes it valid only when those rows are zero.
 * On entry TAN3 and TAN1 hold the tan3/tan1 vectors (see iLLM_HEAD).
 * SREG2 is the register used for row 2. Where MOV_32_ONLY is a real move,
 * rows 2 and 0 are loaded from memory and TAN1 is parked at (%0) while XMMS
 * is in use, then reloaded; those lines name %0 directly, not `dct`.
 * As in the full pass, every result is shifted right arithmetically by 6
 * and stored with movdqa to rows 0..7 of dct. */
285#define iLLM_PASS_SPARSE(dct) \
286 "pmulhw %%xmm4, "TAN3" \n\t" \
287 "paddsw %%xmm4, "TAN3" \n\t" \
288 "movdqa %%xmm6, %%xmm3 \n\t" \
289 "pmulhw %%xmm6, "TAN1" \n\t" \
290 "movdqa %%xmm4, %%xmm1 \n\t" \
291 "psubsw %%xmm1, %%xmm3 \n\t" \
292 "paddsw %%xmm6, %%xmm1 \n\t" \
293 "movdqa "TAN1", %%xmm6 \n\t" \
294 "psubsw "TAN3", "TAN1" \n\t" \
295 "paddsw %%xmm6, "TAN3" \n\t" \
296 "movdqa %%xmm3, %%xmm6 \n\t" \
297 "psubsw "TAN3", %%xmm3 \n\t" \
298 "paddsw %%xmm6, "TAN3" \n\t" \
299 "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
300 "pmulhw %%xmm4, %%xmm3 \n\t" \
301 "pmulhw %%xmm4, "TAN3" \n\t" \
302 "paddsw "TAN3", "TAN3" \n\t" \
303 "paddsw %%xmm3, %%xmm3 \n\t" \
304 "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
305 MOV_32_ONLY ROW2", "SREG2" \n\t" \
306 "pmulhw "SREG2", %%xmm5 \n\t" \
307 MOV_32_ONLY ROW0", "REG0" \n\t" \
308 "movdqa "REG0", %%xmm6 \n\t" \
309 "psubsw "SREG2", %%xmm6 \n\t" \
310 "paddsw "REG0", "SREG2" \n\t" \
311 MOV_32_ONLY" "TAN1", (%0) \n\t" \
312 "movdqa "REG0", "XMMS" \n\t" \
313 "psubsw %%xmm5, "REG0" \n\t" \
314 "paddsw "XMMS", %%xmm5 \n\t" \
315 "movdqa %%xmm5, "XMMS" \n\t" \
316 "psubsw "TAN3", %%xmm5 \n\t" \
317 "paddsw "XMMS", "TAN3" \n\t" \
318 "movdqa "REG0", "XMMS" \n\t" \
319 "psubsw %%xmm3, "REG0" \n\t" \
320 "paddsw "XMMS", %%xmm3 \n\t" \
321 MOV_32_ONLY" (%0), "TAN1" \n\t" \
322 "psraw $6, %%xmm5 \n\t" \
323 "psraw $6, "REG0" \n\t" \
324 "psraw $6, "TAN3" \n\t" \
325 "psraw $6, %%xmm3 \n\t" \
326 "movdqa "TAN3", 1*16("dct") \n\t" \
327 "movdqa %%xmm3, 2*16("dct") \n\t" \
328 "movdqa "REG0", 5*16("dct") \n\t" \
329 "movdqa %%xmm5, 6*16("dct") \n\t" \
330 "movdqa "SREG2", %%xmm0 \n\t" \
331 "movdqa %%xmm6, %%xmm4 \n\t" \
332 "psubsw %%xmm1, "SREG2" \n\t" \
333 "psubsw "TAN1", %%xmm6 \n\t" \
334 "paddsw %%xmm0, %%xmm1 \n\t" \
335 "paddsw %%xmm4, "TAN1" \n\t" \
336 "psraw $6, %%xmm1 \n\t" \
337 "psraw $6, "SREG2" \n\t" \
338 "psraw $6, "TAN1" \n\t" \
339 "psraw $6, %%xmm6 \n\t" \
340 "movdqa %%xmm1, ("dct") \n\t" \
341 "movdqa "TAN1", 3*16("dct") \n\t" \
342 "movdqa %%xmm6, 4*16("dct") \n\t" \
343 "movdqa "SREG2", 7*16("dct") \n\t"
344
/**
 * In-place inverse DCT of one block: a row pass (iMTX_MULT) over up to eight
 * 16-byte rows followed by a column pass (iLLM_PASS or iLLM_PASS_SPARSE)
 * that writes all eight rows back.
 *
 * @param block coefficients, read and overwritten at byte offsets 0..127.
 *              Accessed with movdqa, so it must be 16-byte aligned.
 *
 * Rows 0-2 are always transformed. Rows 3-7 are first tested for being all
 * zero and their row pass is skipped when possible; if rows 4-7 are all
 * zero the sparse column pass is used.
 *
 * NOTE(review): %mm0-%mm2 are written here but do not appear in the clobber
 * list and no emms is issued in this function; presumably MMX state is dealt
 * with by the callers - confirm.
 */
dcb7c868 345inline void ff_xvid_idct_sse2(short *block)
f73a6393 346{
8d27bf1c
DB
347 __asm__ volatile (
/* mm0 = 127 in every byte, the bias used by the zero-row tests below. */
348 "movq "MANGLE (m127) ", %%mm0 \n\t"
/* Rows 0, 1 and 2 are transformed unconditionally. */
349 iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
350 iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders + 1 * 16), PUT_ODD(ROW1))
351 iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders + 2 * 16), PUT_EVEN(ROW2))
352
/* eax = row 3 non-zero mask, ecx = row 4 mask. "1f" is the label that ends
 * the next iMTX_MULT, so a zero row 3 skips its row pass and keeps the
 * cleared ROW3 register. */
353 TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
354 JZ("%%eax", "1f")
355 iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders + 3 * 16), PUT_ODD(ROW3))
356
/* eax = row 5 mask, edx = row 6 mask, esi = row 7 mask. */
357 TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
358 TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
359 iLLM_HEAD
360 ".p2align 4 \n\t"
/* Enter the row-pass chain at the first non-zero row among 4..7; if none
 * is non-zero, fall through to the sparse column pass and finish at 6. */
361 JNZ("%%ecx", "2f")
362 JNZ("%%eax", "3f")
363 JNZ("%%edx", "4f")
364 JNZ("%%esi", "5f")
365 iLLM_PASS_SPARSE("%0")
366 "jmp 6f \n\t"
/* 2: row 4 (rounder argument is "#", so no rounding constant is named). */
367 "2: \n\t"
368 iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
/* 3: row 5, then skip row 6 if it tested zero. */
369 "3: \n\t"
370 iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders + 4 * 16), PUT_ODD(ROW5))
371 JZ("%%edx", "1f")
/* 4: row 6, then skip row 7 if it tested zero. */
372 "4: \n\t"
373 iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders + 5 * 16), PUT_EVEN(ROW6))
374 JZ("%%esi", "1f")
/* 5: row 7 (uses the same rounder group as row 6). */
375 "5: \n\t"
376 iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders + 5 * 16), PUT_ODD(ROW7))
/* On x86-32 TAN3/TAN1 are %%xmm0/%%xmm2, which iMTX_MULT overwrites, so the
 * tangent constants are loaded again before the column pass. */
c59211b4 377#if ARCH_X86_32
8d27bf1c 378 iLLM_HEAD
f73a6393 379#endif
8d27bf1c
DB
380 iLLM_PASS("%0")
381 "6: \n\t"
/* block is the only operand (%0), declared read-write. */
382 : "+r" (block)
383 :
384 : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",
385 "%xmm4", "%xmm5", "%xmm6", "%xmm7", )
ba404520 386#if ARCH_X86_64
8d27bf1c
DB
387 XMM_CLOBBERS("%xmm8", "%xmm9", "%xmm10", "%xmm11",
388 "%xmm12", "%xmm13", "%xmm14", )
ba404520 389#endif
8d27bf1c 390 "%eax", "%ecx", "%edx", "%esi", "memory");
f73a6393
AS
391}
392
/**
 * Run ff_xvid_idct_sse2() on block in place, then hand the result to
 * ff_put_pixels_clamped_mmx() together with dest and line_size.
 *
 * @param dest      destination passed through to ff_put_pixels_clamped_mmx()
 * @param line_size passed through unchanged; presumably the byte stride of
 *                  dest - confirm against ff_put_pixels_clamped_mmx()
 * @param block     coefficient block; overwritten by the IDCT
 */
dcb7c868 393void ff_xvid_idct_sse2_put(uint8_t *dest, int line_size, short *block)
f73a6393 394{
dcb7c868 395 ff_xvid_idct_sse2(block);
7e7c4b60 396 ff_put_pixels_clamped_mmx(block, dest, line_size);
f73a6393
AS
397}
398
/**
 * Run ff_xvid_idct_sse2() on block in place, then hand the result to
 * ff_add_pixels_clamped_mmx() together with dest and line_size.
 *
 * @param dest      destination passed through to ff_add_pixels_clamped_mmx()
 * @param line_size passed through unchanged; presumably the byte stride of
 *                  dest - confirm against ff_add_pixels_clamped_mmx()
 * @param block     coefficient block; overwritten by the IDCT
 */
dcb7c868 399void ff_xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block)
f73a6393 400{
dcb7c868 401 ff_xvid_idct_sse2(block);
7e7c4b60 402 ff_add_pixels_clamped_mmx(block, dest, line_size);
f73a6393 403}
79195ce5 404
0b8b2ae5 405#endif /* HAVE_SSE2_INLINE */