Use full path for #includes from another directory.
[libav.git] / libavcodec / i386 / simple_idct_mmx.c
CommitLineData
37e8dcda 1/*
ff4ec49e
FB
2 * Simple IDCT MMX
3 *
4 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5 *
b78e7197
DB
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
ff4ec49e
FB
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
b78e7197 11 * version 2.1 of the License, or (at your option) any later version.
ff4ec49e 12 *
b78e7197 13 * FFmpeg is distributed in the hope that it will be useful,
ff4ec49e
FB
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
b78e7197 19 * License along with FFmpeg; if not, write to the Free Software
5509bffa 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
ff4ec49e 21 */
245976da
DB
22#include "libavcodec/dsputil.h"
23#include "libavcodec/simple_idct.h"
ff4ec49e 24
9e1795dd
MN
/*
23170.475006
22725.260826
21406.727617
19265.545870
16384.000000
12872.826198
8866.956905
4520.335430
*/
37e8dcda
A
35#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
36#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
37#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
38#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
9e1795dd 39#if 0
37e8dcda 40#define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
9e1795dd
MN
41#else
42#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
43#endif
37e8dcda 44#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
bb270c08
DB
45#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
46#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
37e8dcda
A
47
48#define ROW_SHIFT 11
49#define COL_SHIFT 20 // 6
50
766324fc
RD
51DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
52DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
41338ac0 53
038f0f9b 54DECLARE_ALIGNED(8, static const int16_t, coeffs[])= {
bb270c08
DB
55 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
56// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
57// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
58 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
59 // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
60// 0, 0, 0, 0,
61// 0, 0, 0, 0,
37e8dcda 62
0a8d8945
MN
63 C4, C4, C4, C4,
64 C4, -C4, C4, -C4,
115329f1 65
0a8d8945
MN
66 C2, C6, C2, C6,
67 C6, -C2, C6, -C2,
115329f1 68
0a8d8945
MN
69 C1, C3, C1, C3,
70 C5, C7, C5, C7,
115329f1 71
0a8d8945
MN
72 C3, -C7, C3, -C7,
73-C1, -C5, -C1, -C5,
115329f1 74
0a8d8945
MN
75 C5, -C1, C5, -C1,
76 C7, C3, C7, C3,
115329f1 77
0a8d8945
MN
78 C7, -C5, C7, -C5,
79 C3, -C1, C3, -C1
80};
81
ef5b1b5a 82#if 0
0a8d8945 83static void unused_var_killer(){
bb270c08
DB
84 int a= wm1010 + d40000;
85 temp[0]=a;
0a8d8945
MN
86}
87
37e8dcda
A
88static void inline idctCol (int16_t * col, int16_t *input)
89{
90#undef C0
91#undef C1
92#undef C2
93#undef C3
94#undef C4
95#undef C5
96#undef C6
97#undef C7
bb270c08
DB
98 int a0, a1, a2, a3, b0, b1, b2, b3;
99 const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
100 const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
101 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
102 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103 const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
104 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
105 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
106 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
37e8dcda 107/*
bb270c08
DB
108 if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
109 col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
110 col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
111 return;
112 }*/
37e8dcda
A
113
114col[8*0] = input[8*0 + 0];
115col[8*1] = input[8*2 + 0];
116col[8*2] = input[8*0 + 1];
117col[8*3] = input[8*2 + 1];
118col[8*4] = input[8*4 + 0];
119col[8*5] = input[8*6 + 0];
120col[8*6] = input[8*4 + 1];
121col[8*7] = input[8*6 + 1];
122
bb270c08
DB
123 a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
124 a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
125 a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
126 a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
127
128 b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
129 b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
130 b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
131 b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
132
133 col[8*0] = (a0 + b0) >> COL_SHIFT;
134 col[8*1] = (a1 + b1) >> COL_SHIFT;
135 col[8*2] = (a2 + b2) >> COL_SHIFT;
136 col[8*3] = (a3 + b3) >> COL_SHIFT;
137 col[8*4] = (a3 - b3) >> COL_SHIFT;
138 col[8*5] = (a2 - b2) >> COL_SHIFT;
139 col[8*6] = (a1 - b1) >> COL_SHIFT;
140 col[8*7] = (a0 - b0) >> COL_SHIFT;
37e8dcda
A
141}
142
143static void inline idctRow (int16_t * output, int16_t * input)
144{
bb270c08
DB
145 int16_t row[8];
146
147 int a0, a1, a2, a3, b0, b1, b2, b3;
148 const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
149 const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
150 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
151 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152 const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
154 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
155 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
37e8dcda
A
156
157row[0] = input[0];
158row[2] = input[1];
159row[4] = input[4];
160row[6] = input[5];
161row[1] = input[8];
162row[3] = input[9];
163row[5] = input[12];
164row[7] = input[13];
165
bb270c08
DB
166 if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
167 row[0] = row[1] = row[2] = row[3] = row[4] =
168 row[5] = row[6] = row[7] = row[0]<<3;
169 output[0] = row[0];
170 output[2] = row[1];
171 output[4] = row[2];
172 output[6] = row[3];
173 output[8] = row[4];
174 output[10] = row[5];
175 output[12] = row[6];
176 output[14] = row[7];
177 return;
178 }
179
180 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
181 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
182 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
183 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
184
185 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
186 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
187 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
188 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
189
190 row[0] = (a0 + b0) >> ROW_SHIFT;
191 row[1] = (a1 + b1) >> ROW_SHIFT;
192 row[2] = (a2 + b2) >> ROW_SHIFT;
193 row[3] = (a3 + b3) >> ROW_SHIFT;
194 row[4] = (a3 - b3) >> ROW_SHIFT;
195 row[5] = (a2 - b2) >> ROW_SHIFT;
196 row[6] = (a1 - b1) >> ROW_SHIFT;
197 row[7] = (a0 - b0) >> ROW_SHIFT;
198
199 output[0] = row[0];
200 output[2] = row[1];
201 output[4] = row[2];
202 output[6] = row[3];
203 output[8] = row[4];
204 output[10] = row[5];
205 output[12] = row[6];
206 output[14] = row[7];
37e8dcda
A
207}
208#endif
209
210static inline void idct(int16_t *block)
211{
27215c6b 212 DECLARE_ALIGNED(8, int64_t, align_tmp[16]);
bb270c08 213 int16_t * const temp= (int16_t*)align_tmp;
41338ac0 214
bb270c08 215 asm volatile(
37e8dcda 216#if 0 //Alternative, simpler variant
0a8d8945
MN
217
218#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
bb270c08
DB
219 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
220 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
221 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
222 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
223 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
224 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
225 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
226 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
227 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
228 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
229 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
230 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
231 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
232 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
233 #rounder ", %%mm4 \n\t"\
234 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
235 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
236 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
237 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
238 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
239 #rounder ", %%mm0 \n\t"\
240 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
241 "paddd %%mm0, %%mm0 \n\t" \
242 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
243 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
244 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
245 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
246 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
247 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
248 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
249 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
250 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
251 "psrad $" #shift ", %%mm7 \n\t"\
252 "psrad $" #shift ", %%mm4 \n\t"\
253 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
254 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
255 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
256 "psrad $" #shift ", %%mm1 \n\t"\
257 "psrad $" #shift ", %%mm2 \n\t"\
258 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
259 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
260 "movq %%mm7, " #dst " \n\t"\
261 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
262 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
263 "movq %%mm2, 24+" #dst " \n\t"\
264 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
265 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
266 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
267 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
268 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
269 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
270 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
271 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
272 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
273 "psrad $" #shift ", %%mm2 \n\t"\
274 "psrad $" #shift ", %%mm0 \n\t"\
275 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
276 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
277 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
278 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
279 "psrad $" #shift ", %%mm6 \n\t"\
280 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
281 "movq %%mm2, 8+" #dst " \n\t"\
282 "psrad $" #shift ", %%mm4 \n\t"\
283 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
284 "movq %%mm4, 16+" #dst " \n\t"\
0a8d8945 285
347be472 286#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
bb270c08
DB
287 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
288 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
289 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
290 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
291 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
292 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
293 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
294 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
295 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
296 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
297 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
298 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
bb270c08
DB
299 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
300 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
bb270c08
DB
301 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
302 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
303 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
304 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
305 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
306 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
307 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
308 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
309 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
310 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
311 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
312 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
313 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
314 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
315 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
316 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
317 "psrad $" #shift ", %%mm7 \n\t"\
318 "psrad $" #shift ", %%mm4 \n\t"\
319 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
320 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
321 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
322 "psrad $" #shift ", %%mm0 \n\t"\
323 "psrad $" #shift ", %%mm2 \n\t"\
324 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
325 "movd %%mm7, " #dst " \n\t"\
326 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
327 "movd %%mm0, 16+" #dst " \n\t"\
328 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
329 "movd %%mm2, 96+" #dst " \n\t"\
330 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
331 "movd %%mm4, 112+" #dst " \n\t"\
332 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
333 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
334 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
335 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
336 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
337 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
338 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
339 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
340 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
341 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
342 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
343 "psrad $" #shift ", %%mm2 \n\t"\
344 "psrad $" #shift ", %%mm5 \n\t"\
345 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
346 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
347 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
348 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
349 "psrad $" #shift ", %%mm6 \n\t"\
350 "psrad $" #shift ", %%mm4 \n\t"\
351 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
352 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
353 "movd %%mm2, 32+" #dst " \n\t"\
354 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
355 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
356 "movd %%mm6, 48+" #dst " \n\t"\
357 "movd %%mm4, 64+" #dst " \n\t"\
358 "movd %%mm5, 80+" #dst " \n\t"\
0a8d8945 359
115329f1 360
0a8d8945 361#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
bb270c08
DB
362 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
363 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
364 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
365 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
366 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
367 "pand %%mm0, %%mm4 \n\t"\
368 "por %%mm1, %%mm4 \n\t"\
369 "por %%mm2, %%mm4 \n\t"\
370 "por %%mm3, %%mm4 \n\t"\
371 "packssdw %%mm4,%%mm4 \n\t"\
372 "movd %%mm4, %%eax \n\t"\
373 "orl %%eax, %%eax \n\t"\
374 "jz 1f \n\t"\
375 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
376 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
377 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
378 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
379 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
380 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
381 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
382 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
383 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
384 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
385 #rounder ", %%mm4 \n\t"\
386 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
387 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
388 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
389 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
390 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
391 #rounder ", %%mm0 \n\t"\
392 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
393 "paddd %%mm0, %%mm0 \n\t" \
394 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
395 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
396 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
397 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
398 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
399 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
400 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
401 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
402 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
403 "psrad $" #shift ", %%mm7 \n\t"\
404 "psrad $" #shift ", %%mm4 \n\t"\
405 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
406 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
407 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
408 "psrad $" #shift ", %%mm1 \n\t"\
409 "psrad $" #shift ", %%mm2 \n\t"\
410 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
411 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
412 "movq %%mm7, " #dst " \n\t"\
413 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
414 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
415 "movq %%mm2, 24+" #dst " \n\t"\
416 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
417 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
418 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
419 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
420 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
421 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
422 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
423 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
424 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
425 "psrad $" #shift ", %%mm2 \n\t"\
426 "psrad $" #shift ", %%mm0 \n\t"\
427 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
428 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
429 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
430 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
431 "psrad $" #shift ", %%mm6 \n\t"\
432 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
433 "movq %%mm2, 8+" #dst " \n\t"\
434 "psrad $" #shift ", %%mm4 \n\t"\
435 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
436 "movq %%mm4, 16+" #dst " \n\t"\
437 "jmp 2f \n\t"\
438 "1: \n\t"\
439 "pslld $16, %%mm0 \n\t"\
440 "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
441 "psrad $13, %%mm0 \n\t"\
442 "packssdw %%mm0, %%mm0 \n\t"\
443 "movq %%mm0, " #dst " \n\t"\
444 "movq %%mm0, 8+" #dst " \n\t"\
445 "movq %%mm0, 16+" #dst " \n\t"\
446 "movq %%mm0, 24+" #dst " \n\t"\
447 "2: \n\t"
37e8dcda 448
37e8dcda 449
0a8d8945
MN
450//IDCT( src0, src4, src1, src5, dst, rounder, shift)
451ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
452/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
453ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
454ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
455
456DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
457DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
458DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
459
460
347be472
JD
461//IDCT( src0, src4, src1, src5, dst, shift)
462COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
463COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
464COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
465COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
37e8dcda 466
0a8d8945
MN
467#else
468
469#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
bb270c08
DB
470 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
471 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
472 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
473 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
474 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
475 "pand %%mm0, %%mm4 \n\t"\
476 "por %%mm1, %%mm4 \n\t"\
477 "por %%mm2, %%mm4 \n\t"\
478 "por %%mm3, %%mm4 \n\t"\
479 "packssdw %%mm4,%%mm4 \n\t"\
480 "movd %%mm4, %%eax \n\t"\
481 "orl %%eax, %%eax \n\t"\
482 "jz 1f \n\t"\
483 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
484 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
485 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
486 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
487 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
488 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
489 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
490 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
491 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
492 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
493 #rounder ", %%mm4 \n\t"\
494 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
495 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
496 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
497 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
498 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
499 #rounder ", %%mm0 \n\t"\
500 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
501 "paddd %%mm0, %%mm0 \n\t" \
502 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
503 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
504 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
505 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
506 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
507 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
508 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
509 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
510 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
511 "psrad $" #shift ", %%mm7 \n\t"\
512 "psrad $" #shift ", %%mm4 \n\t"\
513 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
514 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
515 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
516 "psrad $" #shift ", %%mm1 \n\t"\
517 "psrad $" #shift ", %%mm2 \n\t"\
518 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
519 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
520 "movq %%mm7, " #dst " \n\t"\
521 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
522 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
523 "movq %%mm2, 24+" #dst " \n\t"\
524 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
525 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
526 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
527 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
528 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
529 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
530 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
531 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
532 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
533 "psrad $" #shift ", %%mm2 \n\t"\
534 "psrad $" #shift ", %%mm0 \n\t"\
535 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
536 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
537 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
538 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
539 "psrad $" #shift ", %%mm6 \n\t"\
540 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
541 "movq %%mm2, 8+" #dst " \n\t"\
542 "psrad $" #shift ", %%mm4 \n\t"\
543 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
544 "movq %%mm4, 16+" #dst " \n\t"\
545 "jmp 2f \n\t"\
546 "1: \n\t"\
547 "pslld $16, %%mm0 \n\t"\
548 "paddd "MANGLE(d40000)", %%mm0 \n\t"\
549 "psrad $13, %%mm0 \n\t"\
550 "packssdw %%mm0, %%mm0 \n\t"\
551 "movq %%mm0, " #dst " \n\t"\
552 "movq %%mm0, 8+" #dst " \n\t"\
553 "movq %%mm0, 16+" #dst " \n\t"\
554 "movq %%mm0, 24+" #dst " \n\t"\
555 "2: \n\t"
37e8dcda 556
0a8d8945 557#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
bb270c08
DB
558 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
559 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
560 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
561 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
562 "movq %%mm0, %%mm4 \n\t"\
563 "por %%mm1, %%mm4 \n\t"\
564 "por %%mm2, %%mm4 \n\t"\
565 "por %%mm3, %%mm4 \n\t"\
566 "packssdw %%mm4,%%mm4 \n\t"\
567 "movd %%mm4, %%eax \n\t"\
568 "orl %%eax, %%eax \n\t"\
569 "jz " #bt " \n\t"\
570 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
571 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
572 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
573 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
574 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
575 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
576 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
577 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
578 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
579 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
580 #rounder ", %%mm4 \n\t"\
581 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
582 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
583 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
584 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
585 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
586 #rounder ", %%mm0 \n\t"\
587 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
588 "paddd %%mm0, %%mm0 \n\t" \
589 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
590 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
591 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
592 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
593 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
594 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
595 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
596 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
597 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
598 "psrad $" #shift ", %%mm7 \n\t"\
599 "psrad $" #shift ", %%mm4 \n\t"\
600 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
601 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
602 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
603 "psrad $" #shift ", %%mm1 \n\t"\
604 "psrad $" #shift ", %%mm2 \n\t"\
605 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
606 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
607 "movq %%mm7, " #dst " \n\t"\
608 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
609 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
610 "movq %%mm2, 24+" #dst " \n\t"\
611 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
612 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
613 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
614 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
615 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
616 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
617 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
618 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
619 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
620 "psrad $" #shift ", %%mm2 \n\t"\
621 "psrad $" #shift ", %%mm0 \n\t"\
622 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
623 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
624 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
625 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
626 "psrad $" #shift ", %%mm6 \n\t"\
627 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
628 "movq %%mm2, 8+" #dst " \n\t"\
629 "psrad $" #shift ", %%mm4 \n\t"\
630 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
631 "movq %%mm4, 16+" #dst " \n\t"\
0a8d8945
MN
632
633#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
bb270c08
DB
634 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
635 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
636 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
637 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
638 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
639 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
640 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
641 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
642 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
643 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
644 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
645 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
646 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
647 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
648 #rounder ", %%mm4 \n\t"\
649 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
650 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
651 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
652 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
653 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
654 #rounder ", %%mm0 \n\t"\
655 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
656 "paddd %%mm0, %%mm0 \n\t" \
657 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
658 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
659 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
660 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
661 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
662 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
663 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
664 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
665 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
666 "psrad $" #shift ", %%mm7 \n\t"\
667 "psrad $" #shift ", %%mm4 \n\t"\
668 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
669 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
670 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
671 "psrad $" #shift ", %%mm1 \n\t"\
672 "psrad $" #shift ", %%mm2 \n\t"\
673 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
674 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
675 "movq %%mm7, " #dst " \n\t"\
676 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
677 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
678 "movq %%mm2, 24+" #dst " \n\t"\
679 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
680 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
681 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
682 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
683 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
684 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
685 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
686 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
687 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
688 "psrad $" #shift ", %%mm2 \n\t"\
689 "psrad $" #shift ", %%mm0 \n\t"\
690 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
691 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
692 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
693 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
694 "psrad $" #shift ", %%mm6 \n\t"\
695 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
696 "movq %%mm2, 8+" #dst " \n\t"\
697 "psrad $" #shift ", %%mm4 \n\t"\
698 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
699 "movq %%mm4, 16+" #dst " \n\t"\
0a8d8945
MN
700
701//IDCT( src0, src4, src1, src5, dst, rounder, shift)
702DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
703Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
704Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
705Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
706
707#undef IDCT
347be472 708#define IDCT(src0, src4, src1, src5, dst, shift) \
bb270c08
DB
709 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
710 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
711 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
712 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
713 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
714 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
715 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
716 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
717 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
718 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
719 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
720 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
bb270c08
DB
721 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
722 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
bb270c08
DB
723 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
724 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
725 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
726 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
727 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
728 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
729 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
730 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
731 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
732 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
733 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
734 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
735 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
736 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
737 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
738 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
739 "psrad $" #shift ", %%mm7 \n\t"\
740 "psrad $" #shift ", %%mm4 \n\t"\
741 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
742 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
743 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
744 "psrad $" #shift ", %%mm0 \n\t"\
745 "psrad $" #shift ", %%mm2 \n\t"\
746 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
747 "movd %%mm7, " #dst " \n\t"\
748 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
749 "movd %%mm0, 16+" #dst " \n\t"\
750 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
751 "movd %%mm2, 96+" #dst " \n\t"\
752 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
753 "movd %%mm4, 112+" #dst " \n\t"\
754 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
755 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
756 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
757 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
758 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
759 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
760 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
761 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
762 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
763 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
764 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
765 "psrad $" #shift ", %%mm2 \n\t"\
766 "psrad $" #shift ", %%mm5 \n\t"\
767 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
768 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
769 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
770 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
771 "psrad $" #shift ", %%mm6 \n\t"\
772 "psrad $" #shift ", %%mm4 \n\t"\
773 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
774 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
775 "movd %%mm2, 32+" #dst " \n\t"\
776 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
777 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
778 "movd %%mm6, 48+" #dst " \n\t"\
779 "movd %%mm4, 64+" #dst " \n\t"\
780 "movd %%mm5, 80+" #dst " \n\t"
0a8d8945
MN
781
782
347be472
JD
783//IDCT( src0, src4, src1, src5, dst, shift)
784IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
785IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
786IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
787IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
bb270c08 788 "jmp 9f \n\t"
37e8dcda 789
4454dc1b 790 "#" ASMALIGN(4) \
bb270c08 791 "4: \n\t"
0a8d8945
MN
792Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
793Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
37e8dcda 794
0a8d8945 795#undef IDCT
347be472 796#define IDCT(src0, src4, src1, src5, dst, shift) \
bb270c08
DB
797 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
798 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
799 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
800 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
801 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
802 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
803 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
804 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
805 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
806 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
807 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
bb270c08 808 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
bb270c08
DB
809 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
810 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
811 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
812 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
813 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
814 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
815 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
816 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
817 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
818 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
819 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
820 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
821 "psrad $" #shift ", %%mm1 \n\t"\
822 "psrad $" #shift ", %%mm4 \n\t"\
823 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
824 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
825 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
826 "psrad $" #shift ", %%mm0 \n\t"\
827 "psrad $" #shift ", %%mm2 \n\t"\
828 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
829 "movd %%mm1, " #dst " \n\t"\
830 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
831 "movd %%mm0, 16+" #dst " \n\t"\
832 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
833 "movd %%mm2, 96+" #dst " \n\t"\
834 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
835 "movd %%mm4, 112+" #dst " \n\t"\
836 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
837 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
838 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
839 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
840 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
841 "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
842 "psrad $" #shift ", %%mm2 \n\t"\
843 "psrad $" #shift ", %%mm5 \n\t"\
844 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
845 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
846 "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
847 "psrad $" #shift ", %%mm6 \n\t"\
848 "psrad $" #shift ", %%mm1 \n\t"\
849 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
850 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
851 "movd %%mm2, 32+" #dst " \n\t"\
852 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
853 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
854 "movd %%mm6, 48+" #dst " \n\t"\
855 "movd %%mm1, 64+" #dst " \n\t"\
856 "movd %%mm5, 80+" #dst " \n\t"
0a8d8945 857
347be472
JD
858//IDCT( src0, src4, src1, src5, dst, shift)
859IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
860IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
861IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
862IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
bb270c08 863 "jmp 9f \n\t"
37e8dcda 864
4454dc1b 865 "#" ASMALIGN(4) \
bb270c08 866 "6: \n\t"
0a8d8945 867Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
37e8dcda 868
0a8d8945 869#undef IDCT
347be472 870#define IDCT(src0, src4, src1, src5, dst, shift) \
bb270c08
DB
871 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
872 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
873 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
874 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
875 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
876 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
bb270c08 877 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
bb270c08
DB
878 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
879 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
880 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
881 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
882 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
883 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
884 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
885 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
886 "psrad $" #shift ", %%mm1 \n\t"\
887 "psrad $" #shift ", %%mm4 \n\t"\
888 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
889 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
890 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
891 "psrad $" #shift ", %%mm0 \n\t"\
892 "psrad $" #shift ", %%mm2 \n\t"\
893 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
894 "movd %%mm1, " #dst " \n\t"\
895 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
896 "movd %%mm0, 16+" #dst " \n\t"\
897 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
898 "movd %%mm2, 96+" #dst " \n\t"\
899 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
900 "movd %%mm4, 112+" #dst " \n\t"\
901 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
902 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
903 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
904 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
905 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
906 "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
907 "psrad $" #shift ", %%mm2 \n\t"\
908 "psrad $" #shift ", %%mm5 \n\t"\
909 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
910 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
911 "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
912 "psrad $" #shift ", %%mm6 \n\t"\
913 "psrad $" #shift ", %%mm1 \n\t"\
914 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
915 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
916 "movd %%mm2, 32+" #dst " \n\t"\
917 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
918 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
919 "movd %%mm6, 48+" #dst " \n\t"\
920 "movd %%mm1, 64+" #dst " \n\t"\
921 "movd %%mm5, 80+" #dst " \n\t"
0a8d8945
MN
922
923
347be472
JD
924//IDCT( src0, src4, src1, src5, dst, shift)
925IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
926IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
927IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
928IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
bb270c08 929 "jmp 9f \n\t"
37e8dcda 930
4454dc1b 931 "#" ASMALIGN(4) \
bb270c08 932 "2: \n\t"
0a8d8945 933Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
37e8dcda 934
0a8d8945 935#undef IDCT
347be472 936#define IDCT(src0, src4, src1, src5, dst, shift) \
bb270c08
DB
937 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
938 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
939 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
940 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
941 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
942 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
943 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
bb270c08
DB
944 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
945 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
bb270c08
DB
946 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
947 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
948 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
949 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
950 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
951 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
952 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
953 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
954 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
955 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
956 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
957 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
958 "psrad $" #shift ", %%mm7 \n\t"\
959 "psrad $" #shift ", %%mm4 \n\t"\
960 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
961 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
962 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
963 "psrad $" #shift ", %%mm0 \n\t"\
964 "psrad $" #shift ", %%mm2 \n\t"\
965 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
966 "movd %%mm7, " #dst " \n\t"\
967 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
968 "movd %%mm0, 16+" #dst " \n\t"\
969 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
970 "movd %%mm2, 96+" #dst " \n\t"\
971 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
972 "movd %%mm4, 112+" #dst " \n\t"\
973 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
974 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
975 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
976 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
977 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
978 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
979 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
980 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
981 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
982 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
983 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
984 "psrad $" #shift ", %%mm2 \n\t"\
985 "psrad $" #shift ", %%mm5 \n\t"\
986 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
987 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
988 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
989 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
990 "psrad $" #shift ", %%mm6 \n\t"\
991 "psrad $" #shift ", %%mm4 \n\t"\
992 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
993 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
994 "movd %%mm2, 32+" #dst " \n\t"\
995 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
996 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
997 "movd %%mm6, 48+" #dst " \n\t"\
998 "movd %%mm4, 64+" #dst " \n\t"\
999 "movd %%mm5, 80+" #dst " \n\t"
0a8d8945 1000
347be472
JD
1001//IDCT( src0, src4, src1, src5, dst, shift)
1002IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1003IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1004IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1005IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
bb270c08 1006 "jmp 9f \n\t"
37e8dcda 1007
4454dc1b 1008 "#" ASMALIGN(4) \
bb270c08 1009 "3: \n\t"
0a8d8945 1010#undef IDCT
347be472 1011#define IDCT(src0, src4, src1, src5, dst, shift) \
bb270c08
DB
1012 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1013 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1014 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1015 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1016 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1017 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
bb270c08
DB
1018 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1019 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
bb270c08
DB
1020 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1021 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1022 "movq 64(%2), %%mm3 \n\t"\
1023 "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1024 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1025 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1026 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1027 "psrad $" #shift ", %%mm7 \n\t"\
1028 "psrad $" #shift ", %%mm4 \n\t"\
1029 "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
1030 "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1031 "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
1032 "psrad $" #shift ", %%mm0 \n\t"\
1033 "psrad $" #shift ", %%mm1 \n\t"\
1034 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1035 "movd %%mm7, " #dst " \n\t"\
1036 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1037 "movd %%mm0, 16+" #dst " \n\t"\
1038 "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
1039 "movd %%mm1, 96+" #dst " \n\t"\
1040 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1041 "movd %%mm4, 112+" #dst " \n\t"\
1042 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1043 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1044 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1045 "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
1046 "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
1047 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
1048 "psrad $" #shift ", %%mm1 \n\t"\
1049 "psrad $" #shift ", %%mm5 \n\t"\
1050 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1051 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1052 "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
1053 "psrad $" #shift ", %%mm6 \n\t"\
1054 "psrad $" #shift ", %%mm4 \n\t"\
1055 "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
1056 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1057 "movd %%mm1, 32+" #dst " \n\t"\
1058 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1059 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1060 "movd %%mm6, 48+" #dst " \n\t"\
1061 "movd %%mm4, 64+" #dst " \n\t"\
1062 "movd %%mm5, 80+" #dst " \n\t"
0a8d8945
MN
1063
1064
347be472
JD
1065//IDCT( src0, src4, src1, src5, dst, shift)
1066IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1067IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1068IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1069IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
bb270c08 1070 "jmp 9f \n\t"
37e8dcda 1071
4454dc1b 1072 "#" ASMALIGN(4) \
bb270c08 1073 "5: \n\t"
0a8d8945 1074#undef IDCT
347be472 1075#define IDCT(src0, src4, src1, src5, dst, shift) \
bb270c08
DB
1076 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1077 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1078 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1079 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1080 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1081 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1082 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1083 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1084 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1085 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
bb270c08
DB
1086 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1087 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
bb270c08
DB
1088 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1089 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1090 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1091 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1092 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
1093 "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
1094 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1095 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1096 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1097 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1098 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
1099 "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1100 "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
bb270c08
DB
1101 "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
1102 "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
bb270c08
DB
1103 "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
1104 "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
1105 "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
1106 "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
1107 "psrad $" #shift ", %%mm4 \n\t"\
1108 "psrad $" #shift ", %%mm7 \n\t"\
1109 "psrad $" #shift ", %%mm3 \n\t"\
1110 "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
1111 "movq %%mm4, " #dst " \n\t"\
1112 "psrad $" #shift ", %%mm0 \n\t"\
1113 "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
1114 "movq %%mm0, 16+" #dst " \n\t"\
1115 "movq %%mm0, 96+" #dst " \n\t"\
1116 "movq %%mm4, 112+" #dst " \n\t"\
1117 "psrad $" #shift ", %%mm5 \n\t"\
1118 "psrad $" #shift ", %%mm6 \n\t"\
1119 "psrad $" #shift ", %%mm2 \n\t"\
1120 "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1121 "movq %%mm5, 32+" #dst " \n\t"\
1122 "psrad $" #shift ", %%mm1 \n\t"\
1123 "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1124 "movq %%mm6, 48+" #dst " \n\t"\
1125 "movq %%mm6, 64+" #dst " \n\t"\
1126 "movq %%mm5, 80+" #dst " \n\t"
115329f1 1127
0a8d8945 1128
347be472
JD
1129//IDCT( src0, src4, src1, src5, dst, shift)
1130IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1131//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1132IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1133//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
bb270c08 1134 "jmp 9f \n\t"
37e8dcda
A
1135
1136
4454dc1b 1137 "#" ASMALIGN(4) \
bb270c08 1138 "1: \n\t"
0a8d8945 1139#undef IDCT
347be472 1140#define IDCT(src0, src4, src1, src5, dst, shift) \
bb270c08
DB
1141 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1142 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1143 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1144 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1145 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1146 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1147 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1148 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1149 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1150 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1151 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
bb270c08
DB
1152 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1153 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
bb270c08
DB
1154 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1155 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1156 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1157 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1158 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1159 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1160 "movq 64(%2), %%mm1 \n\t"\
1161 "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1162 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1163 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1164 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1165 "psrad $" #shift ", %%mm7 \n\t"\
1166 "psrad $" #shift ", %%mm4 \n\t"\
1167 "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
1168 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1169 "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1170 "psrad $" #shift ", %%mm0 \n\t"\
1171 "psrad $" #shift ", %%mm3 \n\t"\
1172 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1173 "movd %%mm7, " #dst " \n\t"\
1174 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1175 "movd %%mm0, 16+" #dst " \n\t"\
1176 "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1177 "movd %%mm3, 96+" #dst " \n\t"\
1178 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1179 "movd %%mm4, 112+" #dst " \n\t"\
1180 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1181 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1182 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1183 "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
1184 "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1185 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
1186 "psrad $" #shift ", %%mm3 \n\t"\
1187 "psrad $" #shift ", %%mm5 \n\t"\
1188 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1189 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1190 "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
1191 "psrad $" #shift ", %%mm6 \n\t"\
1192 "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1193 "movd %%mm3, 32+" #dst " \n\t"\
1194 "psrad $" #shift ", %%mm4 \n\t"\
1195 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1196 "movd %%mm6, 48+" #dst " \n\t"\
1197 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1198 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1199 "movd %%mm4, 64+" #dst " \n\t"\
1200 "movd %%mm5, 80+" #dst " \n\t"
115329f1 1201
0a8d8945 1202
347be472
JD
1203//IDCT( src0, src4, src1, src5, dst, shift)
1204IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1205IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1206IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1207IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
bb270c08 1208 "jmp 9f \n\t"
37e8dcda
A
1209
1210
4454dc1b 1211 "#" ASMALIGN(4)
bb270c08 1212 "7: \n\t"
0a8d8945 1213#undef IDCT
347be472 1214#define IDCT(src0, src4, src1, src5, dst, shift) \
bb270c08
DB
1215 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1216 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1217 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1218 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1219 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
bb270c08
DB
1220 "psrad $" #shift ", %%mm4 \n\t"\
1221 "psrad $" #shift ", %%mm0 \n\t"\
1222 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
1223 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1224 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1225 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1226 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1227 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
bb270c08
DB
1228 "psrad $" #shift ", %%mm1 \n\t"\
1229 "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
1230 "movq %%mm4, " #dst " \n\t"\
1231 "psrad $" #shift ", %%mm2 \n\t"\
1232 "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
1233 "movq %%mm0, 16+" #dst " \n\t"\
1234 "movq %%mm0, 96+" #dst " \n\t"\
1235 "movq %%mm4, 112+" #dst " \n\t"\
1236 "movq %%mm0, 32+" #dst " \n\t"\
1237 "movq %%mm4, 48+" #dst " \n\t"\
1238 "movq %%mm4, 64+" #dst " \n\t"\
1239 "movq %%mm0, 80+" #dst " \n\t"
0a8d8945 1240
347be472
JD
1241//IDCT( src0, src4, src1, src5, dst, shift)
1242IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1243//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1244IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1245//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
37e8dcda
A
1246
1247
1248#endif
1249
1250/*
1251Input
0a8d8945
MN
1252 00 40 04 44 20 60 24 64
1253 10 30 14 34 50 70 54 74
1254 01 41 03 43 21 61 23 63
37e8dcda 1255 11 31 13 33 51 71 53 73
0a8d8945
MN
1256 02 42 06 46 22 62 26 66
1257 12 32 16 36 52 72 56 76
1258 05 45 07 47 25 65 27 67
1259 15 35 17 37 55 75 57 77
115329f1 1260
37e8dcda 1261Temp
0a8d8945
MN
1262 00 04 10 14 20 24 30 34
1263 40 44 50 54 60 64 70 74
37e8dcda
A
1264 01 03 11 13 21 23 31 33
1265 41 43 51 53 61 63 71 73
0a8d8945
MN
1266 02 06 12 16 22 26 32 36
1267 42 46 52 56 62 66 72 76
37e8dcda
A
1268 05 07 15 17 25 27 35 37
1269 45 47 55 57 65 67 75 77
1270*/
1271
37e8dcda 1272"9: \n\t"
bb270c08
DB
1273 :: "r" (block), "r" (temp), "r" (coeffs)
1274 : "%eax"
1275 );
37e8dcda
A
1276}
1277
/**
 * Public entry point for the MMX simple IDCT.
 *
 * Forwards to the file-local idct() routine, which reads its input from
 * block and stores the final column-pass results back into block, i.e. the
 * transform is performed in place.
 *
 * @param block coefficient buffer, transformed in place
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
1282
1283//FIXME merge add/put into the idct
1284
0c1a9eda 1285void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
2ad1516a
MN
1286{
1287 idct(block);
ec7e0bf0 1288 put_pixels_clamped_mmx(block, dest, line_size);
2ad1516a 1289}
0c1a9eda 1290void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
2ad1516a
MN
1291{
1292 idct(block);
ec7e0bf0 1293 add_pixels_clamped_mmx(block, dest, line_size);
37e8dcda 1294}