avcodec: Rename xvidmmx IDCT to xvid
[libav.git] / libavcodec / x86 / idct_sse2_xvid.c
CommitLineData
f73a6393
AS
1/*
2 * XVID MPEG-4 VIDEO CODEC
3 * - SSE2 inverse discrete cosine transform -
4 *
5 * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
6 *
7 * Conversion to gcc syntax with modifications
8 * by Alexander Strange <astrange@ithinksw.com>
9 *
10 * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
11 *
2912e87a 12 * This file is part of Libav.
f73a6393
AS
13 *
14 * Vertical pass is an implementation of the scheme:
15 * Loeffler C., Ligtenberg A., and Moschytz C.S.:
16 * Practical Fast 1D DCT Algorithm with Eleven Multiplications,
17 * Proc. ICASSP 1989, 988-991.
18 *
19 * Horizontal pass is a double 4x4 vector/matrix multiplication,
20 * (see also Intel's Application Note 922:
21 * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
22 * Copyright (C) 1999 Intel Corporation)
23 *
24 * More details at http://skal.planet-d.net/coding/dct.html
25 *
2912e87a 26 * Libav is free software; you can redistribute it and/or
f73a6393
AS
27 * modify it under the terms of the GNU Lesser General Public
28 * License as published by the Free Software Foundation; either
29 * version 2.1 of the License, or (at your option) any later version.
30 *
2912e87a 31 * Libav is distributed in the hope that it will be useful,
f73a6393
AS
32 * but WITHOUT ANY WARRANTY; without even the implied warranty of
33 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
34 * Lesser General Public License for more details.
35 *
36 * You should have received a copy of the GNU Lesser General Public License
2912e87a 37 * along with Libav; if not, write to the Free Software Foundation,
f73a6393
AS
38 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
39 */
40
1d9c2dc8
MS
41#include "libavutil/internal.h"
42#include "libavutil/mem.h"
c318626c 43#include "libavutil/x86/asm.h"
c4ff7c53 44#include "idct_xvid.h"
e3fcb143 45#include "idctdsp.h"
f73a6393 46
0b8b2ae5 47#if HAVE_SSE2_INLINE
79195ce5 48
adbfc605 49/**
ba87f080 50 * @file
d35b94fb 51 * @brief SSE2 IDCT compatible with the Xvid IDCT
f73a6393
AS
52 */
53
54#define X8(x) x,x,x,x,x,x,x,x
55
56#define ROW_SHIFT 11
57#define COL_SHIFT 6
58
c6727809
MR
/*
 * Fixed-point multipliers for the column pass (iLLM_PASS and
 * iLLM_PASS_SPARSE). X8 replicates each value into eight 16-bit lanes and
 * the column pass applies them with pmulhw. The integers are the commented
 * constants scaled by 65536, e.g. 13036 / 65536 ~= tan(pi/16).
 */
59DECLARE_ASM_CONST(16, int16_t, tan1)[] = {X8(13036)}; // tan( pi/16)
60DECLARE_ASM_CONST(16, int16_t, tan2)[] = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1
// 43790 / 65536 ~= tan(3pi/16). The value is larger than INT16_MAX, which
// appears to be why the note below says "-1"; the column pass adds the
// multiplicand back with paddsw right after the pmulhw by this constant.
61DECLARE_ASM_CONST(16, int16_t, tan3)[] = {X8(43790)}; // tan(3pi/16)-1
62DECLARE_ASM_CONST(16, int16_t, sqrt2)[]= {X8(23170)}; // 0.5/sqrt(2)
// Eight bytes of 127, loaded into mm0 by ff_idct_xvid_sse2(). TEST_ONE_ROW and
// TEST_TWO_ROWS add it with unsigned saturation (paddusb), so every nonzero
// byte ends up with its top bit set before pmovmskb collects those bits.
63DECLARE_ASM_CONST(8, uint8_t, m127)[] = {X8(127)};
f73a6393 64
/*
 * Row-pass coefficient table: 32 int16_t entries that iMTX_MULT reads as four
 * 16-byte pmaddwd operands (at offsets 0, 16, 32 and 48).
 * ff_idct_xvid_sse2() passes this table for rows 0 and 4.
 */
c6727809 65DECLARE_ASM_CONST(16, int16_t, iTab1)[] = {
f73a6393
AS
66 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
67 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
68 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
69 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
70};
71
/*
 * Row-pass coefficient table: 32 int16_t entries that iMTX_MULT reads as four
 * 16-byte pmaddwd operands (at offsets 0, 16, 32 and 48).
 * ff_idct_xvid_sse2() passes this table for rows 1 and 7.
 */
c6727809 72DECLARE_ASM_CONST(16, int16_t, iTab2)[] = {
f73a6393
AS
73 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
74 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
75 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
76 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
77};
78
/*
 * Row-pass coefficient table: 32 int16_t entries that iMTX_MULT reads as four
 * 16-byte pmaddwd operands (at offsets 0, 16, 32 and 48).
 * ff_idct_xvid_sse2() passes this table for rows 2 and 6.
 */
c6727809 79DECLARE_ASM_CONST(16, int16_t, iTab3)[] = {
f73a6393
AS
80 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
81 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
82 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
83 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
84};
85
/*
 * Row-pass coefficient table: 32 int16_t entries that iMTX_MULT reads as four
 * 16-byte pmaddwd operands (at offsets 0, 16, 32 and 48).
 * ff_idct_xvid_sse2() passes this table for rows 3 and 5.
 */
c6727809 86DECLARE_ASM_CONST(16, int16_t, iTab4)[] = {
f73a6393
AS
87 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
88 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
89 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
90 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
91};
92
/*
 * Rounding terms for the row pass: six groups of four identical int32_t
 * values, one group per 16 bytes. ff_idct_xvid_sse2() selects a group with
 * ROUND(walkenIdctRounders + k*16), which iMTX_MULT emits as a paddd into
 * xmm0 ahead of its psrad $11:
 *   k = 0 -> row 0, k = 1 -> row 1, k = 2 -> row 2, k = 3 -> row 3,
 *   k = 4 -> row 5, k = 5 -> rows 6 and 7.
 * Row 4 is passed "#" in place of a rounder, so that asm line becomes an
 * assembler comment and no rounding term is added for it.
 */
c6727809 93DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
f73a6393
AS
94 65536, 65536, 65536, 65536,
95 3597, 3597, 3597, 3597,
96 2260, 2260, 2260, 2260,
97 1203, 1203, 1203, 1203,
98 120, 120, 120, 120,
99 512, 512, 512, 512
100};
101
/*
 * Odd rows are kept in these XMM registers between the row pass and the
 * column pass; the mapping is the same on 32-bit and 64-bit builds.
 */
102// Temporary storage before the column pass
103#define ROW1 "%%xmm6"
104#define ROW3 "%%xmm4"
105#define ROW5 "%%xmm5"
106#define ROW7 "%%xmm7"
107
// CLEAR_ODD(r): zero register r with pxor r,r.
// PUT_ODD(dst): copy xmm2 (where iMTX_MULT leaves its packed result) into
// dst through pshufhw $0x1B, which reverses the order of the four high words
// and leaves the four low words as they are.
108#define CLEAR_ODD(r) "pxor "r","r" \n\t"
109#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"
110
/*
 * Per-architecture placement of the even rows and of the scratch registers.
 *
 * ROWn  - where even row n is kept after the row pass
 * REGn  - register the column pass uses to work on even row n
 * SREG2 - register iLLM_PASS_SPARSE uses for row 2
 * XMMS  - scratch register of the column pass
 * TAN1 / TAN3 - registers holding the tan1 / tan3 constants (see iLLM_HEAD)
 * MOV_32_ONLY - prefix for instructions that are only needed on 32-bit
 */
b250f9c6 111#if ARCH_X86_64
f73a6393
AS
112
// x86-64: even rows live in xmm8-xmm11, so ROWn and REGn are the same
// register and PUT_EVEN / CLEAR_EVEN are the odd-row versions. MOV_32_ONLY is
// "#", which starts a comment in GNU as on x86, so the loads and spills
// prefixed with it are not assembled.
113# define ROW0 "%%xmm8"
114# define REG0 ROW0
115# define ROW2 "%%xmm9"
116# define REG2 ROW2
117# define ROW4 "%%xmm10"
118# define REG4 ROW4
119# define ROW6 "%%xmm11"
120# define REG6 ROW6
121# define CLEAR_EVEN(r) CLEAR_ODD(r)
122# define PUT_EVEN(dst) PUT_ODD(dst)
123# define XMMS "%%xmm12"
124# define MOV_32_ONLY "#"
125# define SREG2 REG2
126# define TAN3 "%%xmm13"
127# define TAN1 "%%xmm14"
128
129#else
130
// 32-bit: only xmm0-xmm7 are used, so even rows are written back to the
// block in memory (ROWn are memory operands relative to %0) and the column
// pass reloads them into REGn through MOV_32_ONLY ("movdqa "). CLEAR_EVEN
// expands to nothing. TAN1 and XMMS are both xmm2, and TAN3 / TAN1 overlap
// registers that iMTX_MULT overwrites.
131# define ROW0 "(%0)"
132# define REG0 "%%xmm4"
133# define ROW2 "2*16(%0)"
134# define REG2 "%%xmm4"
135# define ROW4 "4*16(%0)"
136# define REG4 "%%xmm6"
137# define ROW6 "6*16(%0)"
138# define REG6 "%%xmm6"
139# define CLEAR_EVEN(r)
140# define PUT_EVEN(dst) \
141 "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
142 "movdqa %%xmm2, "dst" \n\t"
143# define XMMS "%%xmm2"
144# define MOV_32_ONLY "movdqa "
145# define SREG2 "%%xmm7"
146# define TAN3 "%%xmm0"
147# define TAN1 "%%xmm2"
148
149#endif
150
// ROUND(x): the text "paddd <mangled symbol x>", without a destination.
// iMTX_MULT appends ", %%xmm0" to it, so x is added to xmm0.
151#define ROUND(x) "paddd "MANGLE(x)
152
// JZ(reg, to): jump to label `to` when the 32-bit register reg is zero.
153#define JZ(reg, to) \
154 "testl "reg","reg" \n\t" \
155 "jz "to" \n\t"
156
// JNZ(reg, to): jump to label `to` when the 32-bit register reg is nonzero.
157#define JNZ(reg, to) \
158 "testl "reg","reg" \n\t" \
159 "jnz "to" \n\t"
160
/*
 * TEST_ONE_ROW(src, reg, clear): emit `clear`, then set reg to a nonzero
 * value if the 16-byte row at memory operand src contains any nonzero byte.
 * The two 8-byte halves of the row are ORed together in mm1, mm0 is added
 * with unsigned saturation and pmovmskb copies the top bit of each byte into
 * reg. This relies on mm0 holding the m127 constant, which
 * ff_idct_xvid_sse2() loads before the first use. Overwrites mm1.
 */
161#define TEST_ONE_ROW(src, reg, clear) \
162 clear \
163 "movq "src", %%mm1 \n\t" \
164 "por 8+"src", %%mm1 \n\t" \
165 "paddusb %%mm0, %%mm1 \n\t" \
166 "pmovmskb %%mm1, "reg" \n\t"
167
/*
 * TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2): the same test as
 * TEST_ONE_ROW for two rows at once, with the result for row1 in reg1 and
 * the result for row2 in reg2. Emits clear1 and clear2 first. Overwrites mm1
 * and mm2.
 */
168#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
169 clear1 \
170 clear2 \
171 "movq "row1", %%mm1 \n\t" \
172 "por 8+"row1", %%mm1 \n\t" \
173 "movq "row2", %%mm2 \n\t" \
174 "por 8+"row2", %%mm2 \n\t" \
175 "paddusb %%mm0, %%mm1 \n\t" \
176 "paddusb %%mm0, %%mm2 \n\t" \
177 "pmovmskb %%mm1, "reg1" \n\t" \
178 "pmovmskb %%mm2, "reg2" \n\t"
179
/*
 * iMTX_MULT(src, table, rounder, put): transform one row.
 *
 * src     - memory operand of the row (eight 16-bit coefficients)
 * table   - mangled symbol of a 64-byte coefficient table (iTab1..iTab4),
 *           read at offsets 0, 16, 32 and 48
 * rounder - instruction text without destination, normally ROUND(...); the
 *           macro appends ", %%xmm0". Passing "#" turns that line into an
 *           assembler comment, i.e. no rounding term is added.
 * put     - asm text that stores the result, e.g. PUT_ODD / PUT_EVEN
 *
 * The even-indexed and odd-indexed coefficients are multiplied separately
 * with pmaddwd, their sum and difference are shifted right by 11 (the same
 * value as ROW_SHIFT, but hard-coded in the asm text) and packed with signed
 * saturation into xmm2, which `put` then consumes. Overwrites xmm0-xmm3.
 *
 * The macro ends with a local label "1:", so a preceding JZ(reg, "1f") skips
 * the whole row.
 */
180///IDCT pass on rows.
181#define iMTX_MULT(src, table, rounder, put) \
182 "movdqa "src", %%xmm3 \n\t" \
183 "movdqa %%xmm3, %%xmm0 \n\t" \
184 "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \
185 "punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \
186 "pmaddwd "table", %%xmm0 \n\t" \
187 "pmaddwd 16+"table", %%xmm1 \n\t" \
188 "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \
189 "punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \
190 "pmaddwd 32+"table", %%xmm2 \n\t" \
191 "pmaddwd 48+"table", %%xmm3 \n\t" \
192 "paddd %%xmm1, %%xmm0 \n\t" \
193 "paddd %%xmm3, %%xmm2 \n\t" \
194 rounder", %%xmm0 \n\t" \
195 "movdqa %%xmm2, %%xmm3 \n\t" \
196 "paddd %%xmm0, %%xmm2 \n\t" \
197 "psubd %%xmm3, %%xmm0 \n\t" \
198 "psrad $11, %%xmm2 \n\t" \
199 "psrad $11, %%xmm0 \n\t" \
200 "packssdw %%xmm0, %%xmm2 \n\t" \
201 put \
202 "1: \n\t"
203
// iLLM_HEAD: load the tan3 and tan1 constants into the TAN3 and TAN1
// registers, where iLLM_PASS and iLLM_PASS_SPARSE expect to find them.
204#define iLLM_HEAD \
205 "movdqa "MANGLE(tan3)", "TAN3" \n\t" \
206 "movdqa "MANGLE(tan1)", "TAN1" \n\t" \
207
/*
 * iLLM_PASS(dct): full column pass over all eight rows.
 *
 * dct - text of the base register operand the eight result rows are stored
 *       through, as n*16(dct) for n = 0..7
 *
 * Inputs: odd rows in xmm6 (ROW1), xmm4 (ROW3), xmm5 (ROW5) and xmm7 (ROW7);
 * even rows through REG0/REG2/REG4/REG6, which the MOV_32_ONLY lines first
 * load from ROWn on 32-bit builds; TAN3 and TAN1 as set up by iLLM_HEAD.
 * sqrt2 and tan2 are loaded from memory inside the macro.
 *
 * Every result is shifted right by 6 (the same value as COL_SHIFT, but
 * hard-coded in the asm text) before it is stored.
 *
 * On 32-bit builds TAN1 is parked at (%0) while XMMS, which is the same
 * register there, is used as scratch, and is reloaded afterwards. That spill
 * addresses operand %0 directly rather than going through dct.
 */
208///IDCT pass on columns.
209#define iLLM_PASS(dct) \
210 "movdqa "TAN3", %%xmm1 \n\t" \
211 "movdqa "TAN1", %%xmm3 \n\t" \
212 "pmulhw %%xmm4, "TAN3" \n\t" \
213 "pmulhw %%xmm5, %%xmm1 \n\t" \
214 "paddsw %%xmm4, "TAN3" \n\t" \
215 "paddsw %%xmm5, %%xmm1 \n\t" \
216 "psubsw %%xmm5, "TAN3" \n\t" \
217 "paddsw %%xmm4, %%xmm1 \n\t" \
218 "pmulhw %%xmm7, %%xmm3 \n\t" \
219 "pmulhw %%xmm6, "TAN1" \n\t" \
220 "paddsw %%xmm6, %%xmm3 \n\t" \
221 "psubsw %%xmm7, "TAN1" \n\t" \
222 "movdqa %%xmm3, %%xmm7 \n\t" \
223 "movdqa "TAN1", %%xmm6 \n\t" \
224 "psubsw %%xmm1, %%xmm3 \n\t" \
225 "psubsw "TAN3", "TAN1" \n\t" \
226 "paddsw %%xmm7, %%xmm1 \n\t" \
227 "paddsw %%xmm6, "TAN3" \n\t" \
228 "movdqa %%xmm3, %%xmm6 \n\t" \
229 "psubsw "TAN3", %%xmm3 \n\t" \
230 "paddsw %%xmm6, "TAN3" \n\t" \
231 "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
232 "pmulhw %%xmm4, %%xmm3 \n\t" \
233 "pmulhw %%xmm4, "TAN3" \n\t" \
234 "paddsw "TAN3", "TAN3" \n\t" \
235 "paddsw %%xmm3, %%xmm3 \n\t" \
236 "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
237 MOV_32_ONLY ROW2", "REG2" \n\t" \
238 MOV_32_ONLY ROW6", "REG6" \n\t" \
239 "movdqa %%xmm7, %%xmm5 \n\t" \
240 "pmulhw "REG6", %%xmm7 \n\t" \
241 "pmulhw "REG2", %%xmm5 \n\t" \
242 "paddsw "REG2", %%xmm7 \n\t" \
243 "psubsw "REG6", %%xmm5 \n\t" \
244 MOV_32_ONLY ROW0", "REG0" \n\t" \
245 MOV_32_ONLY ROW4", "REG4" \n\t" \
246 MOV_32_ONLY" "TAN1", (%0) \n\t" \
247 "movdqa "REG0", "XMMS" \n\t" \
248 "psubsw "REG4", "REG0" \n\t" \
249 "paddsw "XMMS", "REG4" \n\t" \
250 "movdqa "REG4", "XMMS" \n\t" \
251 "psubsw %%xmm7, "REG4" \n\t" \
252 "paddsw "XMMS", %%xmm7 \n\t" \
253 "movdqa "REG0", "XMMS" \n\t" \
254 "psubsw %%xmm5, "REG0" \n\t" \
255 "paddsw "XMMS", %%xmm5 \n\t" \
256 "movdqa %%xmm5, "XMMS" \n\t" \
257 "psubsw "TAN3", %%xmm5 \n\t" \
258 "paddsw "XMMS", "TAN3" \n\t" \
259 "movdqa "REG0", "XMMS" \n\t" \
260 "psubsw %%xmm3, "REG0" \n\t" \
261 "paddsw "XMMS", %%xmm3 \n\t" \
262 MOV_32_ONLY" (%0), "TAN1" \n\t" \
263 "psraw $6, %%xmm5 \n\t" \
264 "psraw $6, "REG0" \n\t" \
265 "psraw $6, "TAN3" \n\t" \
266 "psraw $6, %%xmm3 \n\t" \
267 "movdqa "TAN3", 1*16("dct") \n\t" \
268 "movdqa %%xmm3, 2*16("dct") \n\t" \
269 "movdqa "REG0", 5*16("dct") \n\t" \
270 "movdqa %%xmm5, 6*16("dct") \n\t" \
271 "movdqa %%xmm7, %%xmm0 \n\t" \
272 "movdqa "REG4", %%xmm4 \n\t" \
273 "psubsw %%xmm1, %%xmm7 \n\t" \
274 "psubsw "TAN1", "REG4" \n\t" \
275 "paddsw %%xmm0, %%xmm1 \n\t" \
276 "paddsw %%xmm4, "TAN1" \n\t" \
277 "psraw $6, %%xmm1 \n\t" \
278 "psraw $6, %%xmm7 \n\t" \
279 "psraw $6, "TAN1" \n\t" \
280 "psraw $6, "REG4" \n\t" \
281 "movdqa %%xmm1, ("dct") \n\t" \
282 "movdqa "TAN1", 3*16("dct") \n\t" \
283 "movdqa "REG4", 4*16("dct") \n\t" \
284 "movdqa %%xmm7, 7*16("dct") \n\t"
285
/*
 * iLLM_PASS_SPARSE(dct): shortened column pass used when rows 4-7 are all
 * zero. It still stores all eight result rows.
 *
 * dct - text of the base register operand the eight result rows are stored
 *       through, as n*16(dct) for n = 0..7
 *
 * Inputs: xmm6 (ROW1) and xmm4 (ROW3) for the odd part, row 0 through REG0
 * and row 2 through SREG2 for the even part (loaded from ROW0 / ROW2 by the
 * MOV_32_ONLY lines on 32-bit builds), plus TAN3 and TAN1 as set up by
 * iLLM_HEAD. Rows 4-7 are never read.
 *
 * As in iLLM_PASS, results are shifted right by 6 before being stored, and
 * on 32-bit builds TAN1 is parked at (%0) while XMMS is used as scratch.
 */
286///IDCT pass on columns, assuming rows 4-7 are zero.
287#define iLLM_PASS_SPARSE(dct) \
288 "pmulhw %%xmm4, "TAN3" \n\t" \
289 "paddsw %%xmm4, "TAN3" \n\t" \
290 "movdqa %%xmm6, %%xmm3 \n\t" \
291 "pmulhw %%xmm6, "TAN1" \n\t" \
292 "movdqa %%xmm4, %%xmm1 \n\t" \
293 "psubsw %%xmm1, %%xmm3 \n\t" \
294 "paddsw %%xmm6, %%xmm1 \n\t" \
295 "movdqa "TAN1", %%xmm6 \n\t" \
296 "psubsw "TAN3", "TAN1" \n\t" \
297 "paddsw %%xmm6, "TAN3" \n\t" \
298 "movdqa %%xmm3, %%xmm6 \n\t" \
299 "psubsw "TAN3", %%xmm3 \n\t" \
300 "paddsw %%xmm6, "TAN3" \n\t" \
301 "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
302 "pmulhw %%xmm4, %%xmm3 \n\t" \
303 "pmulhw %%xmm4, "TAN3" \n\t" \
304 "paddsw "TAN3", "TAN3" \n\t" \
305 "paddsw %%xmm3, %%xmm3 \n\t" \
306 "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
307 MOV_32_ONLY ROW2", "SREG2" \n\t" \
308 "pmulhw "SREG2", %%xmm5 \n\t" \
309 MOV_32_ONLY ROW0", "REG0" \n\t" \
310 "movdqa "REG0", %%xmm6 \n\t" \
311 "psubsw "SREG2", %%xmm6 \n\t" \
312 "paddsw "REG0", "SREG2" \n\t" \
313 MOV_32_ONLY" "TAN1", (%0) \n\t" \
314 "movdqa "REG0", "XMMS" \n\t" \
315 "psubsw %%xmm5, "REG0" \n\t" \
316 "paddsw "XMMS", %%xmm5 \n\t" \
317 "movdqa %%xmm5, "XMMS" \n\t" \
318 "psubsw "TAN3", %%xmm5 \n\t" \
319 "paddsw "XMMS", "TAN3" \n\t" \
320 "movdqa "REG0", "XMMS" \n\t" \
321 "psubsw %%xmm3, "REG0" \n\t" \
322 "paddsw "XMMS", %%xmm3 \n\t" \
323 MOV_32_ONLY" (%0), "TAN1" \n\t" \
324 "psraw $6, %%xmm5 \n\t" \
325 "psraw $6, "REG0" \n\t" \
326 "psraw $6, "TAN3" \n\t" \
327 "psraw $6, %%xmm3 \n\t" \
328 "movdqa "TAN3", 1*16("dct") \n\t" \
329 "movdqa %%xmm3, 2*16("dct") \n\t" \
330 "movdqa "REG0", 5*16("dct") \n\t" \
331 "movdqa %%xmm5, 6*16("dct") \n\t" \
332 "movdqa "SREG2", %%xmm0 \n\t" \
333 "movdqa %%xmm6, %%xmm4 \n\t" \
334 "psubsw %%xmm1, "SREG2" \n\t" \
335 "psubsw "TAN1", %%xmm6 \n\t" \
336 "paddsw %%xmm0, %%xmm1 \n\t" \
337 "paddsw %%xmm4, "TAN1" \n\t" \
338 "psraw $6, %%xmm1 \n\t" \
339 "psraw $6, "SREG2" \n\t" \
340 "psraw $6, "TAN1" \n\t" \
341 "psraw $6, %%xmm6 \n\t" \
342 "movdqa %%xmm1, ("dct") \n\t" \
343 "movdqa "TAN1", 3*16("dct") \n\t" \
344 "movdqa %%xmm6, 4*16("dct") \n\t" \
345 "movdqa "SREG2", 7*16("dct") \n\t"
346
/**
 * In-place 8x8 inverse DCT.
 *
 * The asm reads eight 16-byte rows starting at block and the column pass
 * stores the eight result rows back through the same pointer.
 *
 * Rows 0-2 always go through the row pass (iMTX_MULT). Rows 3-7 are first
 * tested for nonzero content and their row pass is skipped when they are
 * empty. If rows 4-7 are all zero the shorter iLLM_PASS_SPARSE column pass
 * is used, otherwise iLLM_PASS.
 *
 * @param block coefficients on entry, result on return. All row accesses
 *              use movdqa, which requires the address to be 16-byte aligned.
 *
 * NOTE(review): the zero tests write mm0-mm2, but no MMX register appears in
 * the clobber list and no emms is issued in this function; presumably the
 * callers take care of the MMX state - confirm.
 */
347inline void ff_idct_xvid_sse2(short *block)
348{
be449fca 349 __asm__ volatile(
f73a6393
AS
/* mm0 = 127 in every byte; needed by the TEST_*_ROW* zero checks below. */
350 "movq "MANGLE(m127)", %%mm0 \n\t"
351 iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
352 iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
353 iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
354
/* eax = nonzero flag of row 3, ecx = of row 4. An empty row 3 jumps to the
 * "1:" label that ends the next iMTX_MULT, leaving ROW3 cleared. */
355 TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
356 JZ("%%eax", "1f")
357 iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
358
/* eax = nonzero flag of row 5, edx = of row 6, esi = of row 7. */
359 TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
360 TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
361 iLLM_HEAD
ef4a6514 362 ".p2align 4 \n\t"
f73a6393
AS
/* Enter the chain of row passes below at the first nonzero row among 4-7;
 * if all four are empty, fall through to the sparse column pass. */
363 JNZ("%%ecx", "2f")
364 JNZ("%%eax", "3f")
365 JNZ("%%edx", "4f")
366 JNZ("%%esi", "5f")
367 iLLM_PASS_SPARSE("%0")
368 "jmp 6f \n\t"
/* Row 4: "#" in place of a rounder, so no rounding term is added. */
369 "2: \n\t"
370 iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
371 "3: \n\t"
372 iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
373 JZ("%%edx", "1f")
374 "4: \n\t"
375 iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
376 JZ("%%esi", "1f")
377 "5: \n\t"
378 iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
/* On 32-bit builds TAN3 / TAN1 are xmm0 / xmm2, which iMTX_MULT has
 * overwritten, so the constants are loaded again. */
c59211b4 379#if ARCH_X86_32
f73a6393
AS
380 iLLM_HEAD
381#endif
382 iLLM_PASS("%0")
383 "6: \n\t"
384 : "+r"(block)
385 :
153ca56b
RP
386 : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" ,
387 "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" ,)
ba404520 388#if ARCH_X86_64
153ca56b
RP
389 XMM_CLOBBERS("%xmm8" , "%xmm9" , "%xmm10", "%xmm11",
390 "%xmm12", "%xmm13", "%xmm14",)
ba404520 391#endif
153ca56b 392 "%eax", "%ecx", "%edx", "%esi", "memory"
616735eb 393 );
f73a6393
AS
394}
395
/**
 * Run ff_idct_xvid_sse2() on block in place, then pass the transformed block
 * to ff_put_pixels_clamped_mmx() together with dest and line_size.
 *
 * @param dest      forwarded unchanged to ff_put_pixels_clamped_mmx()
 * @param line_size forwarded unchanged to ff_put_pixels_clamped_mmx()
 * @param block     coefficient block; overwritten by the IDCT result
 */
396void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
397{
398 ff_idct_xvid_sse2(block);
7e7c4b60 399 ff_put_pixels_clamped_mmx(block, dest, line_size);
f73a6393
AS
400}
401
/**
 * Run ff_idct_xvid_sse2() on block in place, then pass the transformed block
 * to ff_add_pixels_clamped_mmx() together with dest and line_size.
 *
 * @param dest      forwarded unchanged to ff_add_pixels_clamped_mmx()
 * @param line_size forwarded unchanged to ff_add_pixels_clamped_mmx()
 * @param block     coefficient block; overwritten by the IDCT result
 */
402void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
403{
404 ff_idct_xvid_sse2(block);
7e7c4b60 405 ff_add_pixels_clamped_mmx(block, dest, line_size);
f73a6393 406}
79195ce5 407
0b8b2ae5 408#endif /* HAVE_SSE2_INLINE */