Reportedly FFmpeg fails to compile on Cygwin with vhook enabled, but FFserver
[libav.git] / libavcodec / i386 / simple_idct_mmx.c
CommitLineData
37e8dcda 1/*
ff4ec49e
FB
2 * Simple IDCT MMX
3 *
4 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
37e8dcda 20#include "../dsputil.h"
e96682e6 21#include "../simple_idct.h"
ff4ec49e 22
9e1795dd
MN
/*
Unrounded values of cos(i*M_PI/16)*sqrt(2)*(1<<14) for i = 0..7:
23170.475006
22725.260826
21406.727617
19265.545870
16384.000000
12872.826198
8866.956905
4520.335430
*/
37e8dcda
A
/*
 * Fixed-point IDCT constants: Ci = cos(i*M_PI/16)*sqrt(2)*(1<<14), rounded to
 * the nearest integer.  C4 is the exception: the active branch uses 16383
 * (the "- 0.5" variant) instead of the exact 16384.
 */
#define C0 23170 // cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C1 22725 // cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C2 21407 // cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C3 19266 // cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#if 0
#define C4 16384 // cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#else
#define C4 16383 // cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5
#endif
#define C5 12873 // cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C6 8867  // cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C7 4520  // cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5

/* Right-shift amounts applied to the results of the row and column passes. */
#define ROW_SHIFT 11
#define COL_SHIFT 20 // 6
48
5c0513bd
DB
49static const uint64_t attribute_used __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
50static const uint64_t attribute_used __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;
41338ac0
MN
51
52static const int16_t __attribute__((aligned(8))) coeffs[]= {
37e8dcda
A
53 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
54// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
55// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
56 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
57 // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
58// 0, 0, 0, 0,
59// 0, 0, 0, 0,
60
0a8d8945
MN
61 C4, C4, C4, C4,
62 C4, -C4, C4, -C4,
115329f1 63
0a8d8945
MN
64 C2, C6, C2, C6,
65 C6, -C2, C6, -C2,
115329f1 66
0a8d8945
MN
67 C1, C3, C1, C3,
68 C5, C7, C5, C7,
115329f1 69
0a8d8945
MN
70 C3, -C7, C3, -C7,
71-C1, -C5, -C1, -C5,
115329f1 72
0a8d8945
MN
73 C5, -C1, C5, -C1,
74 C7, C3, C7, C3,
115329f1 75
0a8d8945
MN
76 C7, -C5, C7, -C5,
77 C3, -C1, C3, -C1
78};
79
ef5b1b5a 80#if 0
0a8d8945
MN
81static void unused_var_killer(){
82 int a= wm1010 + d40000;
83 temp[0]=a;
84}
85
37e8dcda
A
86static void inline idctCol (int16_t * col, int16_t *input)
87{
88#undef C0
89#undef C1
90#undef C2
91#undef C3
92#undef C4
93#undef C5
94#undef C6
95#undef C7
96 int a0, a1, a2, a3, b0, b1, b2, b3;
97 const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
98 const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
99 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
100 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
ccf589a8 101 const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
37e8dcda
A
102 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
103 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
104 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
105/*
106 if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
107 col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
108 col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
109 return;
110 }*/
111
112col[8*0] = input[8*0 + 0];
113col[8*1] = input[8*2 + 0];
114col[8*2] = input[8*0 + 1];
115col[8*3] = input[8*2 + 1];
116col[8*4] = input[8*4 + 0];
117col[8*5] = input[8*6 + 0];
118col[8*6] = input[8*4 + 1];
119col[8*7] = input[8*6 + 1];
120
121 a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
122 a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
123 a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
124 a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
125
126 b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
127 b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
128 b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
129 b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
130
131 col[8*0] = (a0 + b0) >> COL_SHIFT;
132 col[8*1] = (a1 + b1) >> COL_SHIFT;
133 col[8*2] = (a2 + b2) >> COL_SHIFT;
134 col[8*3] = (a3 + b3) >> COL_SHIFT;
135 col[8*4] = (a3 - b3) >> COL_SHIFT;
136 col[8*5] = (a2 - b2) >> COL_SHIFT;
137 col[8*6] = (a1 - b1) >> COL_SHIFT;
138 col[8*7] = (a0 - b0) >> COL_SHIFT;
139}
140
141static void inline idctRow (int16_t * output, int16_t * input)
142{
143 int16_t row[8];
144
145 int a0, a1, a2, a3, b0, b1, b2, b3;
146 const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
147 const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
148 const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
149 const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
ccf589a8 150 const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
37e8dcda
A
151 const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
152 const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
153 const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
154
155row[0] = input[0];
156row[2] = input[1];
157row[4] = input[4];
158row[6] = input[5];
159row[1] = input[8];
160row[3] = input[9];
161row[5] = input[12];
162row[7] = input[13];
163
164 if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
165 row[0] = row[1] = row[2] = row[3] = row[4] =
166 row[5] = row[6] = row[7] = row[0]<<3;
167 output[0] = row[0];
168 output[2] = row[1];
169 output[4] = row[2];
170 output[6] = row[3];
171 output[8] = row[4];
172 output[10] = row[5];
173 output[12] = row[6];
174 output[14] = row[7];
175 return;
176 }
177
178 a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
179 a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
180 a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
181 a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
182
183 b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
184 b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
185 b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
186 b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
187
188 row[0] = (a0 + b0) >> ROW_SHIFT;
189 row[1] = (a1 + b1) >> ROW_SHIFT;
190 row[2] = (a2 + b2) >> ROW_SHIFT;
191 row[3] = (a3 + b3) >> ROW_SHIFT;
192 row[4] = (a3 - b3) >> ROW_SHIFT;
193 row[5] = (a2 - b2) >> ROW_SHIFT;
194 row[6] = (a1 - b1) >> ROW_SHIFT;
195 row[7] = (a0 - b0) >> ROW_SHIFT;
196
197 output[0] = row[0];
198 output[2] = row[1];
199 output[4] = row[2];
200 output[6] = row[3];
201 output[8] = row[4];
202 output[10] = row[5];
203 output[12] = row[6];
204 output[14] = row[7];
205}
206#endif
207
208static inline void idct(int16_t *block)
209{
41338ac0
MN
210 int64_t __attribute__((aligned(8))) align_tmp[16];
211 int16_t * const temp= (int16_t*)align_tmp;
212
37e8dcda 213 asm volatile(
37e8dcda 214#if 0 //Alternative, simpler variant
0a8d8945
MN
215
216#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
217 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
218 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
37e8dcda
A
219 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
220 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
0a8d8945
MN
221 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
222 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
223 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
224 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
225 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
226 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
227 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
228 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
229 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
230 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
37e8dcda 231 #rounder ", %%mm4 \n\t"\
0a8d8945
MN
232 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
233 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
234 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
235 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
236 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
237 #rounder ", %%mm0 \n\t"\
238 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
239 "paddd %%mm0, %%mm0 \n\t" \
240 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
241 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
242 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
243 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
244 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
245 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
37e8dcda 246 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
0a8d8945
MN
247 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
248 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
249 "psrad $" #shift ", %%mm7 \n\t"\
250 "psrad $" #shift ", %%mm4 \n\t"\
251 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
252 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
253 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
254 "psrad $" #shift ", %%mm1 \n\t"\
255 "psrad $" #shift ", %%mm2 \n\t"\
256 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
257 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
258 "movq %%mm7, " #dst " \n\t"\
259 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
260 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
261 "movq %%mm2, 24+" #dst " \n\t"\
262 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
263 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
264 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
265 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
266 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
267 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
268 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
269 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
270 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
271 "psrad $" #shift ", %%mm2 \n\t"\
272 "psrad $" #shift ", %%mm0 \n\t"\
273 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
274 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
275 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
276 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
37e8dcda 277 "psrad $" #shift ", %%mm6 \n\t"\
0a8d8945
MN
278 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
279 "movq %%mm2, 8+" #dst " \n\t"\
37e8dcda 280 "psrad $" #shift ", %%mm4 \n\t"\
0a8d8945
MN
281 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
282 "movq %%mm4, 16+" #dst " \n\t"\
283
284#define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
285 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
286 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
287 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
288 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
289 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
290 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
291 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
292 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
293 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
294 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
295 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
296 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
37e8dcda 297 #rounder ", %%mm4 \n\t"\
0a8d8945
MN
298 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
299 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
300 #rounder ", %%mm0 \n\t"\
301 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
302 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
303 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
304 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
305 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
306 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
307 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
308 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
309 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
310 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
311 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
312 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
313 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
314 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
315 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
316 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
317 "psrad $" #shift ", %%mm7 \n\t"\
37e8dcda 318 "psrad $" #shift ", %%mm4 \n\t"\
0a8d8945
MN
319 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
320 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
321 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
322 "psrad $" #shift ", %%mm0 \n\t"\
323 "psrad $" #shift ", %%mm2 \n\t"\
324 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
325 "movd %%mm7, " #dst " \n\t"\
326 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
327 "movd %%mm0, 16+" #dst " \n\t"\
328 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
329 "movd %%mm2, 96+" #dst " \n\t"\
330 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
331 "movd %%mm4, 112+" #dst " \n\t"\
332 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
333 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
334 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
335 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
336 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
37e8dcda 337 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
0a8d8945
MN
338 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
339 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
340 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
341 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
342 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
343 "psrad $" #shift ", %%mm2 \n\t"\
344 "psrad $" #shift ", %%mm5 \n\t"\
345 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
346 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
347 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
348 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
37e8dcda
A
349 "psrad $" #shift ", %%mm6 \n\t"\
350 "psrad $" #shift ", %%mm4 \n\t"\
0a8d8945
MN
351 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
352 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
353 "movd %%mm2, 32+" #dst " \n\t"\
354 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
355 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
356 "movd %%mm6, 48+" #dst " \n\t"\
357 "movd %%mm4, 64+" #dst " \n\t"\
358 "movd %%mm5, 80+" #dst " \n\t"\
359
115329f1 360
0a8d8945
MN
361#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
362 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
363 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
37e8dcda
A
364 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
365 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
4bdd9157 366 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
37e8dcda
A
367 "pand %%mm0, %%mm4 \n\t"\
368 "por %%mm1, %%mm4 \n\t"\
369 "por %%mm2, %%mm4 \n\t"\
370 "por %%mm3, %%mm4 \n\t"\
371 "packssdw %%mm4,%%mm4 \n\t"\
372 "movd %%mm4, %%eax \n\t"\
373 "orl %%eax, %%eax \n\t"\
374 "jz 1f \n\t"\
0a8d8945
MN
375 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
376 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
377 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
378 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
379 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
380 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
381 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
382 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
383 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
384 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
37e8dcda 385 #rounder ", %%mm4 \n\t"\
0a8d8945
MN
386 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
387 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
388 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
389 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
390 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
391 #rounder ", %%mm0 \n\t"\
392 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
393 "paddd %%mm0, %%mm0 \n\t" \
394 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
395 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
396 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
397 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
398 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
399 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
37e8dcda 400 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
0a8d8945
MN
401 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
402 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
403 "psrad $" #shift ", %%mm7 \n\t"\
37e8dcda 404 "psrad $" #shift ", %%mm4 \n\t"\
0a8d8945
MN
405 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
406 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
407 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
408 "psrad $" #shift ", %%mm1 \n\t"\
409 "psrad $" #shift ", %%mm2 \n\t"\
410 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
411 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
412 "movq %%mm7, " #dst " \n\t"\
413 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
414 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
415 "movq %%mm2, 24+" #dst " \n\t"\
416 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
417 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
418 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
37e8dcda 419 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
0a8d8945
MN
420 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
421 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
422 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
423 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
424 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
37e8dcda
A
425 "psrad $" #shift ", %%mm2 \n\t"\
426 "psrad $" #shift ", %%mm0 \n\t"\
0a8d8945
MN
427 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
428 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
429 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
430 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
431 "psrad $" #shift ", %%mm6 \n\t"\
432 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
433 "movq %%mm2, 8+" #dst " \n\t"\
434 "psrad $" #shift ", %%mm4 \n\t"\
435 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
436 "movq %%mm4, 16+" #dst " \n\t"\
37e8dcda
A
437 "jmp 2f \n\t"\
438 "1: \n\t"\
0a8d8945 439 "pslld $16, %%mm0 \n\t"\
4bdd9157 440 "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
0a8d8945
MN
441 "psrad $13, %%mm0 \n\t"\
442 "packssdw %%mm0, %%mm0 \n\t"\
443 "movq %%mm0, " #dst " \n\t"\
444 "movq %%mm0, 8+" #dst " \n\t"\
445 "movq %%mm0, 16+" #dst " \n\t"\
446 "movq %%mm0, 24+" #dst " \n\t"\
447 "2: \n\t"
37e8dcda 448
37e8dcda 449
0a8d8945
MN
450//IDCT( src0, src4, src1, src5, dst, rounder, shift)
451ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
452/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
453ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
454ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
455
456DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
457DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
458DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
459
460
461//IDCT( src0, src4, src1, src5, dst, rounder, shift)
462COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
463COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
464COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
465COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
37e8dcda 466
0a8d8945
MN
467#else
468
469#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
470 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
471 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
37e8dcda
A
472 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
473 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
4bdd9157 474 "movq "MANGLE(wm1010)", %%mm4 \n\t"\
37e8dcda
A
475 "pand %%mm0, %%mm4 \n\t"\
476 "por %%mm1, %%mm4 \n\t"\
477 "por %%mm2, %%mm4 \n\t"\
478 "por %%mm3, %%mm4 \n\t"\
479 "packssdw %%mm4,%%mm4 \n\t"\
480 "movd %%mm4, %%eax \n\t"\
481 "orl %%eax, %%eax \n\t"\
482 "jz 1f \n\t"\
0a8d8945
MN
483 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
484 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
485 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
486 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
487 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
488 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
489 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
490 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
491 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
492 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
37e8dcda 493 #rounder ", %%mm4 \n\t"\
0a8d8945
MN
494 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
495 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
496 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
497 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
498 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
499 #rounder ", %%mm0 \n\t"\
500 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
501 "paddd %%mm0, %%mm0 \n\t" \
502 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
503 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
504 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
505 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
506 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
507 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
37e8dcda 508 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
0a8d8945
MN
509 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
510 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
511 "psrad $" #shift ", %%mm7 \n\t"\
37e8dcda 512 "psrad $" #shift ", %%mm4 \n\t"\
0a8d8945
MN
513 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
514 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
515 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
516 "psrad $" #shift ", %%mm1 \n\t"\
517 "psrad $" #shift ", %%mm2 \n\t"\
518 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
519 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
520 "movq %%mm7, " #dst " \n\t"\
521 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
522 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
523 "movq %%mm2, 24+" #dst " \n\t"\
524 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
525 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
526 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
37e8dcda 527 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
0a8d8945
MN
528 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
529 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
530 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
531 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
532 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
37e8dcda
A
533 "psrad $" #shift ", %%mm2 \n\t"\
534 "psrad $" #shift ", %%mm0 \n\t"\
0a8d8945
MN
535 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
536 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
537 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
538 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
539 "psrad $" #shift ", %%mm6 \n\t"\
540 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
541 "movq %%mm2, 8+" #dst " \n\t"\
542 "psrad $" #shift ", %%mm4 \n\t"\
543 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
544 "movq %%mm4, 16+" #dst " \n\t"\
37e8dcda 545 "jmp 2f \n\t"\
37e8dcda 546 "1: \n\t"\
0a8d8945 547 "pslld $16, %%mm0 \n\t"\
4bdd9157 548 "paddd "MANGLE(d40000)", %%mm0 \n\t"\
0a8d8945
MN
549 "psrad $13, %%mm0 \n\t"\
550 "packssdw %%mm0, %%mm0 \n\t"\
551 "movq %%mm0, " #dst " \n\t"\
552 "movq %%mm0, 8+" #dst " \n\t"\
553 "movq %%mm0, 16+" #dst " \n\t"\
554 "movq %%mm0, 24+" #dst " \n\t"\
555 "2: \n\t"
37e8dcda 556
0a8d8945
MN
557#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
558 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
559 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
37e8dcda
A
560 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
561 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
562 "movq %%mm0, %%mm4 \n\t"\
563 "por %%mm1, %%mm4 \n\t"\
564 "por %%mm2, %%mm4 \n\t"\
565 "por %%mm3, %%mm4 \n\t"\
0a8d8945 566 "packssdw %%mm4,%%mm4 \n\t"\
37e8dcda
A
567 "movd %%mm4, %%eax \n\t"\
568 "orl %%eax, %%eax \n\t"\
569 "jz " #bt " \n\t"\
0a8d8945
MN
570 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
571 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
572 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
573 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
574 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
575 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
576 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
577 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
578 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
579 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
37e8dcda 580 #rounder ", %%mm4 \n\t"\
0a8d8945
MN
581 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
582 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
583 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
584 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
585 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
586 #rounder ", %%mm0 \n\t"\
587 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
588 "paddd %%mm0, %%mm0 \n\t" \
589 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
590 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
591 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
592 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
593 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
594 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
37e8dcda 595 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
0a8d8945
MN
596 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
597 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
598 "psrad $" #shift ", %%mm7 \n\t"\
37e8dcda 599 "psrad $" #shift ", %%mm4 \n\t"\
0a8d8945
MN
600 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
601 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
602 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
603 "psrad $" #shift ", %%mm1 \n\t"\
604 "psrad $" #shift ", %%mm2 \n\t"\
605 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
606 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
607 "movq %%mm7, " #dst " \n\t"\
608 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
609 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
610 "movq %%mm2, 24+" #dst " \n\t"\
611 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
612 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
613 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
37e8dcda 614 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
0a8d8945
MN
615 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
616 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
617 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
618 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
619 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
620 "psrad $" #shift ", %%mm2 \n\t"\
621 "psrad $" #shift ", %%mm0 \n\t"\
622 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
623 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
624 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
625 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
37e8dcda 626 "psrad $" #shift ", %%mm6 \n\t"\
0a8d8945
MN
627 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
628 "movq %%mm2, 8+" #dst " \n\t"\
37e8dcda 629 "psrad $" #shift ", %%mm4 \n\t"\
0a8d8945
MN
630 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
631 "movq %%mm4, 16+" #dst " \n\t"\
632
633#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
634 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
635 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
636 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
637 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
638 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
639 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
640 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
641 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
642 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
643 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
644 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
645 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
646 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
647 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
648 #rounder ", %%mm4 \n\t"\
649 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
650 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
651 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
652 "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
653 "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
37e8dcda 654 #rounder ", %%mm0 \n\t"\
0a8d8945
MN
655 "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
656 "paddd %%mm0, %%mm0 \n\t" \
657 "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
658 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
659 "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
660 "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
661 "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
662 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
663 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
664 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
665 "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
666 "psrad $" #shift ", %%mm7 \n\t"\
667 "psrad $" #shift ", %%mm4 \n\t"\
668 "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
669 "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
670 "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
671 "psrad $" #shift ", %%mm1 \n\t"\
672 "psrad $" #shift ", %%mm2 \n\t"\
673 "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
674 "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
675 "movq %%mm7, " #dst " \n\t"\
676 "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
677 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
678 "movq %%mm2, 24+" #dst " \n\t"\
679 "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
680 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
681 "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
682 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
683 "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
684 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
685 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
686 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
687 "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
37e8dcda
A
688 "psrad $" #shift ", %%mm2 \n\t"\
689 "psrad $" #shift ", %%mm0 \n\t"\
0a8d8945
MN
690 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
691 "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
692 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
693 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
694 "psrad $" #shift ", %%mm6 \n\t"\
695 "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
696 "movq %%mm2, 8+" #dst " \n\t"\
697 "psrad $" #shift ", %%mm4 \n\t"\
698 "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
699 "movq %%mm4, 16+" #dst " \n\t"\
700
701//IDCT( src0, src4, src1, src5, dst, rounder, shift)
702DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
703Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
704Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
705Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
706
707#undef IDCT
708#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
709 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
710 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
37e8dcda
A
711 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
712 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
0a8d8945
MN
713 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
714 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
715 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
716 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
717 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
718 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
719 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
720 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
721 #rounder ", %%mm4 \n\t"\
722 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
723 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
724 #rounder ", %%mm0 \n\t"\
725 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
37e8dcda 726 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
0a8d8945
MN
727 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
728 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
729 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
730 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
731 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
732 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
733 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
734 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
735 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
736 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
737 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
37e8dcda 738 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
0a8d8945
MN
739 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
740 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
741 "psrad $" #shift ", %%mm7 \n\t"\
37e8dcda 742 "psrad $" #shift ", %%mm4 \n\t"\
0a8d8945
MN
743 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
744 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
745 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
746 "psrad $" #shift ", %%mm0 \n\t"\
747 "psrad $" #shift ", %%mm2 \n\t"\
748 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
749 "movd %%mm7, " #dst " \n\t"\
750 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
751 "movd %%mm0, 16+" #dst " \n\t"\
752 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
753 "movd %%mm2, 96+" #dst " \n\t"\
754 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
755 "movd %%mm4, 112+" #dst " \n\t"\
756 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
757 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
758 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
759 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
760 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
37e8dcda 761 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
0a8d8945
MN
762 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
763 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
764 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
765 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
766 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
767 "psrad $" #shift ", %%mm2 \n\t"\
768 "psrad $" #shift ", %%mm5 \n\t"\
769 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
770 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
771 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
772 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
37e8dcda
A
773 "psrad $" #shift ", %%mm6 \n\t"\
774 "psrad $" #shift ", %%mm4 \n\t"\
0a8d8945
MN
775 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
776 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
777 "movd %%mm2, 32+" #dst " \n\t"\
778 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
779 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
780 "movd %%mm6, 48+" #dst " \n\t"\
781 "movd %%mm4, 64+" #dst " \n\t"\
782 "movd %%mm5, 80+" #dst " \n\t"
783
784
785//IDCT( src0, src4, src1, src5, dst, rounder, shift)
786IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
787IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
788IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
789IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
37e8dcda
A
790 "jmp 9f \n\t"
791
792 "#.balign 16 \n\t"\
793 "4: \n\t"
0a8d8945
MN
794Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
795Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
37e8dcda 796
0a8d8945
MN
797#undef IDCT
798#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
799 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
800 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
37e8dcda 801 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
0a8d8945
MN
802 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
803 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
804 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
805 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
806 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
807 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
808 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
809 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
810 #rounder ", %%mm4 \n\t"\
811 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
812 #rounder ", %%mm0 \n\t"\
37e8dcda 813 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
0a8d8945
MN
814 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
815 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
816 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
817 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
818 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
819 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
37e8dcda
A
820 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
821 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
0a8d8945
MN
822 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
823 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
824 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
825 "psrad $" #shift ", %%mm1 \n\t"\
37e8dcda 826 "psrad $" #shift ", %%mm4 \n\t"\
0a8d8945
MN
827 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
828 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
829 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
37e8dcda 830 "psrad $" #shift ", %%mm0 \n\t"\
0a8d8945
MN
831 "psrad $" #shift ", %%mm2 \n\t"\
832 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
833 "movd %%mm1, " #dst " \n\t"\
834 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
835 "movd %%mm0, 16+" #dst " \n\t"\
836 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
837 "movd %%mm2, 96+" #dst " \n\t"\
838 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
839 "movd %%mm4, 112+" #dst " \n\t"\
840 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
841 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
842 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
843 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
844 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
845 "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
846 "psrad $" #shift ", %%mm2 \n\t"\
847 "psrad $" #shift ", %%mm5 \n\t"\
848 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
849 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
850 "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
851 "psrad $" #shift ", %%mm6 \n\t"\
852 "psrad $" #shift ", %%mm1 \n\t"\
853 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
854 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
855 "movd %%mm2, 32+" #dst " \n\t"\
856 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
857 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
858 "movd %%mm6, 48+" #dst " \n\t"\
859 "movd %%mm1, 64+" #dst " \n\t"\
115329f1 860 "movd %%mm5, 80+" #dst " \n\t"
0a8d8945
MN
861
862//IDCT( src0, src4, src1, src5, dst, rounder, shift)
863IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
864IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
865IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
866IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
37e8dcda
A
867 "jmp 9f \n\t"
868
869 "#.balign 16 \n\t"\
870 "6: \n\t"
0a8d8945 871Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
37e8dcda 872
0a8d8945
MN
873#undef IDCT
874#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
875 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
37e8dcda 876 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
0a8d8945
MN
877 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
878 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
879 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
880 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
881 #rounder ", %%mm4 \n\t"\
882 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
883 #rounder ", %%mm0 \n\t"\
884 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
885 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
886 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
37e8dcda
A
887 "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
888 "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
0a8d8945
MN
889 "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
890 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
891 "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
892 "psrad $" #shift ", %%mm1 \n\t"\
37e8dcda 893 "psrad $" #shift ", %%mm4 \n\t"\
0a8d8945
MN
894 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
895 "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
896 "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
37e8dcda 897 "psrad $" #shift ", %%mm0 \n\t"\
0a8d8945
MN
898 "psrad $" #shift ", %%mm2 \n\t"\
899 "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
900 "movd %%mm1, " #dst " \n\t"\
901 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
902 "movd %%mm0, 16+" #dst " \n\t"\
903 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
904 "movd %%mm2, 96+" #dst " \n\t"\
905 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
906 "movd %%mm4, 112+" #dst " \n\t"\
907 "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
908 "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
909 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
910 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
911 "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
912 "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
913 "psrad $" #shift ", %%mm2 \n\t"\
914 "psrad $" #shift ", %%mm5 \n\t"\
915 "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
916 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
917 "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
918 "psrad $" #shift ", %%mm6 \n\t"\
919 "psrad $" #shift ", %%mm1 \n\t"\
920 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
921 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
922 "movd %%mm2, 32+" #dst " \n\t"\
923 "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
924 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
925 "movd %%mm6, 48+" #dst " \n\t"\
926 "movd %%mm1, 64+" #dst " \n\t"\
115329f1 927 "movd %%mm5, 80+" #dst " \n\t"
0a8d8945
MN
928
929
930//IDCT( src0, src4, src1, src5, dst, rounder, shift)
931IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
932IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
933IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
934IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
37e8dcda
A
935 "jmp 9f \n\t"
936
937 "#.balign 16 \n\t"\
938 "2: \n\t"
0a8d8945 939Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
37e8dcda 940
0a8d8945
MN
941#undef IDCT
942#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
943 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
37e8dcda
A
944 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
945 "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
0a8d8945
MN
946 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
947 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
948 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
949 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
950 #rounder ", %%mm4 \n\t"\
951 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
952 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
953 #rounder ", %%mm0 \n\t"\
954 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
955 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
956 "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
957 "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
958 "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
959 "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
960 "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
961 "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
962 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
37e8dcda 963 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
0a8d8945
MN
964 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
965 "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
966 "psrad $" #shift ", %%mm7 \n\t"\
37e8dcda 967 "psrad $" #shift ", %%mm4 \n\t"\
0a8d8945
MN
968 "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
969 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
970 "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
971 "psrad $" #shift ", %%mm0 \n\t"\
972 "psrad $" #shift ", %%mm2 \n\t"\
973 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
974 "movd %%mm7, " #dst " \n\t"\
975 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
976 "movd %%mm0, 16+" #dst " \n\t"\
977 "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
978 "movd %%mm2, 96+" #dst " \n\t"\
979 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
980 "movd %%mm4, 112+" #dst " \n\t"\
981 "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
982 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
983 "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
984 "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
985 "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
37e8dcda 986 "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
0a8d8945
MN
987 "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
988 "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
989 "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
990 "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
991 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
992 "psrad $" #shift ", %%mm2 \n\t"\
993 "psrad $" #shift ", %%mm5 \n\t"\
994 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
995 "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
996 "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
997 "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
37e8dcda
A
998 "psrad $" #shift ", %%mm6 \n\t"\
999 "psrad $" #shift ", %%mm4 \n\t"\
0a8d8945
MN
1000 "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
1001 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1002 "movd %%mm2, 32+" #dst " \n\t"\
1003 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1004 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1005 "movd %%mm6, 48+" #dst " \n\t"\
1006 "movd %%mm4, 64+" #dst " \n\t"\
1007 "movd %%mm5, 80+" #dst " \n\t"
1008
1009//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1010IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1011IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1012IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1013IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
37e8dcda
A
1014 "jmp 9f \n\t"
1015
1016 "#.balign 16 \n\t"\
1017 "3: \n\t"
0a8d8945
MN
1018#undef IDCT
1019#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1020 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
37e8dcda 1021 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
0a8d8945
MN
1022 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1023 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1024 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1025 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1026 #rounder ", %%mm4 \n\t"\
1027 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1028 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1029 #rounder ", %%mm0 \n\t"\
1030 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1031 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1032 "movq 64(%2), %%mm3 \n\t"\
1033 "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1034 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
37e8dcda 1035 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
0a8d8945
MN
1036 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1037 "psrad $" #shift ", %%mm7 \n\t"\
37e8dcda 1038 "psrad $" #shift ", %%mm4 \n\t"\
0a8d8945
MN
1039 "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
1040 "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1041 "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
1042 "psrad $" #shift ", %%mm0 \n\t"\
1043 "psrad $" #shift ", %%mm1 \n\t"\
1044 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1045 "movd %%mm7, " #dst " \n\t"\
1046 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1047 "movd %%mm0, 16+" #dst " \n\t"\
1048 "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
1049 "movd %%mm1, 96+" #dst " \n\t"\
1050 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1051 "movd %%mm4, 112+" #dst " \n\t"\
1052 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1053 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1054 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1055 "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
1056 "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
1057 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
1058 "psrad $" #shift ", %%mm1 \n\t"\
37e8dcda 1059 "psrad $" #shift ", %%mm5 \n\t"\
0a8d8945
MN
1060 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1061 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1062 "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
37e8dcda
A
1063 "psrad $" #shift ", %%mm6 \n\t"\
1064 "psrad $" #shift ", %%mm4 \n\t"\
0a8d8945
MN
1065 "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
1066 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1067 "movd %%mm1, 32+" #dst " \n\t"\
1068 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1069 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1070 "movd %%mm6, 48+" #dst " \n\t"\
1071 "movd %%mm4, 64+" #dst " \n\t"\
1072 "movd %%mm5, 80+" #dst " \n\t"
1073
1074
1075//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1076IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1077IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1078IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1079IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
37e8dcda
A
1080 "jmp 9f \n\t"
1081
1082 "#.balign 16 \n\t"\
1083 "5: \n\t"
0a8d8945
MN
1084#undef IDCT
1085#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1086 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1087 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1088 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1089 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1090 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1091 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1092 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1093 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1094 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1095 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1096 #rounder ", %%mm4 \n\t"\
1097 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
37e8dcda 1098 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
0a8d8945
MN
1099 #rounder ", %%mm0 \n\t"\
1100 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1101 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1102 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1103 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1104 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
1105 "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
1106 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1107 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1108 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1109 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1110 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
1111 "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1112 "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1113 #rounder ", %%mm1 \n\t"\
1114 "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
1115 "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
1116 #rounder ", %%mm2 \n\t"\
1117 "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
1118 "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
1119 "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
1120 "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
37e8dcda 1121 "psrad $" #shift ", %%mm4 \n\t"\
37e8dcda 1122 "psrad $" #shift ", %%mm7 \n\t"\
0a8d8945
MN
1123 "psrad $" #shift ", %%mm3 \n\t"\
1124 "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
1125 "movq %%mm4, " #dst " \n\t"\
37e8dcda 1126 "psrad $" #shift ", %%mm0 \n\t"\
0a8d8945
MN
1127 "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
1128 "movq %%mm0, 16+" #dst " \n\t"\
1129 "movq %%mm0, 96+" #dst " \n\t"\
1130 "movq %%mm4, 112+" #dst " \n\t"\
1131 "psrad $" #shift ", %%mm5 \n\t"\
1132 "psrad $" #shift ", %%mm6 \n\t"\
37e8dcda 1133 "psrad $" #shift ", %%mm2 \n\t"\
0a8d8945
MN
1134 "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1135 "movq %%mm5, 32+" #dst " \n\t"\
1136 "psrad $" #shift ", %%mm1 \n\t"\
1137 "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1138 "movq %%mm6, 48+" #dst " \n\t"\
1139 "movq %%mm6, 64+" #dst " \n\t"\
115329f1
DB
1140 "movq %%mm5, 80+" #dst " \n\t"
1141
0a8d8945
MN
1142
1143//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1144IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1145//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1146IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1147//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
37e8dcda
A
1148 "jmp 9f \n\t"
1149
1150
1151 "#.balign 16 \n\t"\
1152 "1: \n\t"
0a8d8945
MN
1153#undef IDCT
1154#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1155 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1156 "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
37e8dcda 1157 "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
0a8d8945
MN
1158 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1159 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1160 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1161 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1162 "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1163 "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1164 "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1165 "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1166 #rounder ", %%mm4 \n\t"\
1167 "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1168 "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1169 #rounder ", %%mm0 \n\t"\
1170 "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
37e8dcda 1171 "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
0a8d8945
MN
1172 "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1173 "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1174 "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1175 "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1176 "movq 64(%2), %%mm1 \n\t"\
1177 "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1178 "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
37e8dcda 1179 "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
0a8d8945
MN
1180 "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1181 "psrad $" #shift ", %%mm7 \n\t"\
37e8dcda 1182 "psrad $" #shift ", %%mm4 \n\t"\
0a8d8945
MN
1183 "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
1184 "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1185 "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1186 "psrad $" #shift ", %%mm0 \n\t"\
1187 "psrad $" #shift ", %%mm3 \n\t"\
1188 "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1189 "movd %%mm7, " #dst " \n\t"\
1190 "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1191 "movd %%mm0, 16+" #dst " \n\t"\
1192 "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1193 "movd %%mm3, 96+" #dst " \n\t"\
1194 "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1195 "movd %%mm4, 112+" #dst " \n\t"\
1196 "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1197 "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1198 "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1199 "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
1200 "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1201 "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
1202 "psrad $" #shift ", %%mm3 \n\t"\
1203 "psrad $" #shift ", %%mm5 \n\t"\
1204 "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1205 "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1206 "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
37e8dcda 1207 "psrad $" #shift ", %%mm6 \n\t"\
0a8d8945
MN
1208 "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1209 "movd %%mm3, 32+" #dst " \n\t"\
37e8dcda 1210 "psrad $" #shift ", %%mm4 \n\t"\
0a8d8945
MN
1211 "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1212 "movd %%mm6, 48+" #dst " \n\t"\
1213 "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1214 "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1215 "movd %%mm4, 64+" #dst " \n\t"\
1216 "movd %%mm5, 80+" #dst " \n\t"
115329f1 1217
0a8d8945
MN
1218
1219//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1220IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1221IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1222IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1223IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
37e8dcda
A
1224 "jmp 9f \n\t"
1225
1226
1227 "#.balign 16 \n\t"
1228 "7: \n\t"
0a8d8945
MN
1229#undef IDCT
1230#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
1231 "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1232 "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1233 "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1234 "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1235 "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1236 #rounder ", %%mm4 \n\t"\
1237 #rounder ", %%mm0 \n\t"\
37e8dcda 1238 "psrad $" #shift ", %%mm4 \n\t"\
37e8dcda 1239 "psrad $" #shift ", %%mm0 \n\t"\
0a8d8945
MN
1240 "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
1241 "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1242 "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1243 "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1244 "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1245 "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
1246 #rounder ", %%mm1 \n\t"\
1247 #rounder ", %%mm2 \n\t"\
37e8dcda 1248 "psrad $" #shift ", %%mm1 \n\t"\
0a8d8945
MN
1249 "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
1250 "movq %%mm4, " #dst " \n\t"\
1251 "psrad $" #shift ", %%mm2 \n\t"\
1252 "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
1253 "movq %%mm0, 16+" #dst " \n\t"\
1254 "movq %%mm0, 96+" #dst " \n\t"\
1255 "movq %%mm4, 112+" #dst " \n\t"\
1256 "movq %%mm0, 32+" #dst " \n\t"\
1257 "movq %%mm4, 48+" #dst " \n\t"\
1258 "movq %%mm4, 64+" #dst " \n\t"\
115329f1 1259 "movq %%mm0, 80+" #dst " \n\t"
0a8d8945
MN
1260
1261//IDCT( src0, src4, src1, src5, dst, rounder, shift)
1262IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
1263//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
1264IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
1265//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
37e8dcda
A
1266
1267
1268#endif
1269
/*
Input
 00 40 04 44 20 60 24 64
 10 30 14 34 50 70 54 74
 01 41 03 43 21 61 23 63
 11 31 13 33 51 71 53 73
 02 42 06 46 22 62 26 66
 12 32 16 36 52 72 56 76
 05 45 07 47 25 65 27 67
 15 35 17 37 55 75 57 77

Temp
 00 04 10 14 20 24 30 34
 40 44 50 54 60 64 70 74
 01 03 11 13 21 23 31 33
 41 43 51 53 61 63 71 73
 02 06 12 16 22 26 32 36
 42 46 52 56 62 66 72 76
 05 07 15 17 25 27 35 37
 45 47 55 57 65 67 75 77
*/
1291
37e8dcda
A
1292"9: \n\t"
1293 :: "r" (block), "r" (temp), "r" (coeffs)
1294 : "%eax"
1295 );
37e8dcda
A
1296}
1297
/**
 * Run the MMX simple IDCT on one coefficient block.
 *
 * Thin public entry point around the file-local idct() routine; the
 * transformed values are written back into the same buffer.
 *
 * @param block coefficients to transform in place
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
1302
//FIXME: merge the add/put steps into the idct itself
1304
0c1a9eda 1305void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
2ad1516a
MN
1306{
1307 idct(block);
ec7e0bf0 1308 put_pixels_clamped_mmx(block, dest, line_size);
2ad1516a 1309}
0c1a9eda 1310void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
2ad1516a
MN
1311{
1312 idct(block);
ec7e0bf0 1313 add_pixels_clamped_mmx(block, dest, line_size);
37e8dcda 1314}