Commit | Line | Data |
---|---|---|
37e8dcda | 1 | /* |
ff4ec49e FB |
2 | * Simple IDCT MMX |
3 | * | |
4 | * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at> | |
5 | * | |
6 | * This library is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU Lesser General Public | |
8 | * License as published by the Free Software Foundation; either | |
9 | * version 2 of the License, or (at your option) any later version. | |
10 | * | |
11 | * This library is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | * Lesser General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU Lesser General Public | |
17 | * License along with this library; if not, write to the Free Software | |
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
19 | */ | |
37e8dcda | 20 | #include "../dsputil.h" |
e96682e6 | 21 | #include "../simple_idct.h" |
ff4ec49e | 22 | |
9e1795dd MN |
23 | /* |
24 | 23170.475006 | |
25 | 22725.260826 | |
26 | 21406.727617 | |
27 | 19265.545870 | |
28 | 16384.000000 | |
29 | 12872.826198 | |
30 | 8866.956905 | |
31 | 4520.335430 | |
32 | */ | |
37e8dcda A |
/*
 * Fixed-point cosine factors for the IDCT:
 *     Ci = cos(i*M_PI/16) * sqrt(2) * (1<<14), rounded to the nearest integer.
 * C4 is the exception: the active branch uses 16383 (formula with "- 0.5"
 * instead of "+ 0.5"), one below the exact value 16384 kept in the
 * disabled branch.
 */
#define C0 23170
#define C1 22725
#define C2 21407
#define C3 19266
#if 0
#define C4 16384 /* exact value, currently disabled */
#else
#define C4 16383 /* value actually in use */
#endif
#define C5 12873
#define C6 8867
#define C7 4520

/* Right shifts applied after the row pass and the column pass respectively. */
#define ROW_SHIFT 11
#define COL_SHIFT 20 // 6
48 | ||
5c0513bd DB |
49 | static const uint64_t attribute_used __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL; |
50 | static const uint64_t attribute_used __attribute__((aligned(8))) d40000= 0x0000000000040000ULL; | |
41338ac0 MN |
51 | |
52 | static const int16_t __attribute__((aligned(8))) coeffs[]= { | |
37e8dcda A |
53 | 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, |
54 | // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0, | |
55 | // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16), | |
56 | 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, | |
57 | // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :) | |
58 | // 0, 0, 0, 0, | |
59 | // 0, 0, 0, 0, | |
60 | ||
0a8d8945 MN |
61 | C4, C4, C4, C4, |
62 | C4, -C4, C4, -C4, | |
115329f1 | 63 | |
0a8d8945 MN |
64 | C2, C6, C2, C6, |
65 | C6, -C2, C6, -C2, | |
115329f1 | 66 | |
0a8d8945 MN |
67 | C1, C3, C1, C3, |
68 | C5, C7, C5, C7, | |
115329f1 | 69 | |
0a8d8945 MN |
70 | C3, -C7, C3, -C7, |
71 | -C1, -C5, -C1, -C5, | |
115329f1 | 72 | |
0a8d8945 MN |
73 | C5, -C1, C5, -C1, |
74 | C7, C3, C7, C3, | |
115329f1 | 75 | |
0a8d8945 MN |
76 | C7, -C5, C7, -C5, |
77 | C3, -C1, C3, -C1 | |
78 | }; | |
79 | ||
ef5b1b5a | 80 | #if 0 |
0a8d8945 MN |
81 | static void unused_var_killer(){ |
82 | int a= wm1010 + d40000; | |
83 | temp[0]=a; | |
84 | } | |
85 | ||
37e8dcda A |
86 | static void inline idctCol (int16_t * col, int16_t *input) |
87 | { | |
88 | #undef C0 | |
89 | #undef C1 | |
90 | #undef C2 | |
91 | #undef C3 | |
92 | #undef C4 | |
93 | #undef C5 | |
94 | #undef C6 | |
95 | #undef C7 | |
96 | int a0, a1, a2, a3, b0, b1, b2, b3; | |
97 | const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
98 | const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
99 | const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
100 | const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
ccf589a8 | 101 | const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
37e8dcda A |
102 | const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
103 | const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
104 | const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
105 | /* | |
106 | if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) { | |
107 | col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] = | |
108 | col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3; | |
109 | return; | |
110 | }*/ | |
111 | ||
112 | col[8*0] = input[8*0 + 0]; | |
113 | col[8*1] = input[8*2 + 0]; | |
114 | col[8*2] = input[8*0 + 1]; | |
115 | col[8*3] = input[8*2 + 1]; | |
116 | col[8*4] = input[8*4 + 0]; | |
117 | col[8*5] = input[8*6 + 0]; | |
118 | col[8*6] = input[8*4 + 1]; | |
119 | col[8*7] = input[8*6 + 1]; | |
120 | ||
121 | a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1)); | |
122 | a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1)); | |
123 | a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1)); | |
124 | a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1)); | |
125 | ||
126 | b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7]; | |
127 | b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7]; | |
128 | b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7]; | |
129 | b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7]; | |
130 | ||
131 | col[8*0] = (a0 + b0) >> COL_SHIFT; | |
132 | col[8*1] = (a1 + b1) >> COL_SHIFT; | |
133 | col[8*2] = (a2 + b2) >> COL_SHIFT; | |
134 | col[8*3] = (a3 + b3) >> COL_SHIFT; | |
135 | col[8*4] = (a3 - b3) >> COL_SHIFT; | |
136 | col[8*5] = (a2 - b2) >> COL_SHIFT; | |
137 | col[8*6] = (a1 - b1) >> COL_SHIFT; | |
138 | col[8*7] = (a0 - b0) >> COL_SHIFT; | |
139 | } | |
140 | ||
141 | static void inline idctRow (int16_t * output, int16_t * input) | |
142 | { | |
143 | int16_t row[8]; | |
144 | ||
145 | int a0, a1, a2, a3, b0, b1, b2, b3; | |
146 | const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
147 | const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
148 | const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
149 | const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
ccf589a8 | 150 | const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
37e8dcda A |
151 | const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
152 | const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
153 | const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
154 | ||
155 | row[0] = input[0]; | |
156 | row[2] = input[1]; | |
157 | row[4] = input[4]; | |
158 | row[6] = input[5]; | |
159 | row[1] = input[8]; | |
160 | row[3] = input[9]; | |
161 | row[5] = input[12]; | |
162 | row[7] = input[13]; | |
163 | ||
164 | if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) { | |
165 | row[0] = row[1] = row[2] = row[3] = row[4] = | |
166 | row[5] = row[6] = row[7] = row[0]<<3; | |
167 | output[0] = row[0]; | |
168 | output[2] = row[1]; | |
169 | output[4] = row[2]; | |
170 | output[6] = row[3]; | |
171 | output[8] = row[4]; | |
172 | output[10] = row[5]; | |
173 | output[12] = row[6]; | |
174 | output[14] = row[7]; | |
175 | return; | |
176 | } | |
177 | ||
178 | a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1)); | |
179 | a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1)); | |
180 | a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1)); | |
181 | a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1)); | |
182 | ||
183 | b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7]; | |
184 | b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7]; | |
185 | b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7]; | |
186 | b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7]; | |
187 | ||
188 | row[0] = (a0 + b0) >> ROW_SHIFT; | |
189 | row[1] = (a1 + b1) >> ROW_SHIFT; | |
190 | row[2] = (a2 + b2) >> ROW_SHIFT; | |
191 | row[3] = (a3 + b3) >> ROW_SHIFT; | |
192 | row[4] = (a3 - b3) >> ROW_SHIFT; | |
193 | row[5] = (a2 - b2) >> ROW_SHIFT; | |
194 | row[6] = (a1 - b1) >> ROW_SHIFT; | |
195 | row[7] = (a0 - b0) >> ROW_SHIFT; | |
196 | ||
197 | output[0] = row[0]; | |
198 | output[2] = row[1]; | |
199 | output[4] = row[2]; | |
200 | output[6] = row[3]; | |
201 | output[8] = row[4]; | |
202 | output[10] = row[5]; | |
203 | output[12] = row[6]; | |
204 | output[14] = row[7]; | |
205 | } | |
206 | #endif | |
207 | ||
208 | static inline void idct(int16_t *block) | |
209 | { | |
41338ac0 MN |
210 | int64_t __attribute__((aligned(8))) align_tmp[16]; |
211 | int16_t * const temp= (int16_t*)align_tmp; | |
212 | ||
37e8dcda | 213 | asm volatile( |
37e8dcda | 214 | #if 0 //Alternative, simpler variant |
0a8d8945 MN |
215 | |
216 | #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
217 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
218 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
37e8dcda A |
219 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
220 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
0a8d8945 MN |
221 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
222 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
223 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
224 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
225 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
226 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
227 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
228 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
229 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
230 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
37e8dcda | 231 | #rounder ", %%mm4 \n\t"\ |
0a8d8945 MN |
232 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
233 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
234 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
235 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
236 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
237 | #rounder ", %%mm0 \n\t"\ | |
238 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
239 | "paddd %%mm0, %%mm0 \n\t" \ | |
240 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
241 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
242 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
243 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
244 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
245 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
37e8dcda | 246 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
0a8d8945 MN |
247 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
248 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
249 | "psrad $" #shift ", %%mm7 \n\t"\ | |
250 | "psrad $" #shift ", %%mm4 \n\t"\ | |
251 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
252 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
253 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
254 | "psrad $" #shift ", %%mm1 \n\t"\ | |
255 | "psrad $" #shift ", %%mm2 \n\t"\ | |
256 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
257 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
258 | "movq %%mm7, " #dst " \n\t"\ | |
259 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
260 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
261 | "movq %%mm2, 24+" #dst " \n\t"\ | |
262 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
263 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
264 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
265 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
266 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
267 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
268 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
269 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
270 | "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ | |
271 | "psrad $" #shift ", %%mm2 \n\t"\ | |
272 | "psrad $" #shift ", %%mm0 \n\t"\ | |
273 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
274 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
275 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
276 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
37e8dcda | 277 | "psrad $" #shift ", %%mm6 \n\t"\ |
0a8d8945 MN |
278 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ |
279 | "movq %%mm2, 8+" #dst " \n\t"\ | |
37e8dcda | 280 | "psrad $" #shift ", %%mm4 \n\t"\ |
0a8d8945 MN |
281 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ |
282 | "movq %%mm4, 16+" #dst " \n\t"\ | |
283 | ||
284 | #define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
285 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
286 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
287 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
288 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
289 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
290 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
291 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
292 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
293 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
294 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
295 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
296 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
37e8dcda | 297 | #rounder ", %%mm4 \n\t"\ |
0a8d8945 MN |
298 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
299 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
300 | #rounder ", %%mm0 \n\t"\ | |
301 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
302 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
303 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
304 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
305 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
306 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
307 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
308 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
309 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
310 | "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ | |
311 | "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
312 | "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
313 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
314 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
315 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
316 | "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ | |
317 | "psrad $" #shift ", %%mm7 \n\t"\ | |
37e8dcda | 318 | "psrad $" #shift ", %%mm4 \n\t"\ |
0a8d8945 MN |
319 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
320 | "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
321 | "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
322 | "psrad $" #shift ", %%mm0 \n\t"\ | |
323 | "psrad $" #shift ", %%mm2 \n\t"\ | |
324 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
325 | "movd %%mm7, " #dst " \n\t"\ | |
326 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
327 | "movd %%mm0, 16+" #dst " \n\t"\ | |
328 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
329 | "movd %%mm2, 96+" #dst " \n\t"\ | |
330 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
331 | "movd %%mm4, 112+" #dst " \n\t"\ | |
332 | "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ | |
333 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
334 | "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
335 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
336 | "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
37e8dcda | 337 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
0a8d8945 MN |
338 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
339 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
340 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
341 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
342 | "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
343 | "psrad $" #shift ", %%mm2 \n\t"\ | |
344 | "psrad $" #shift ", %%mm5 \n\t"\ | |
345 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
346 | "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ | |
347 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
348 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
37e8dcda A |
349 | "psrad $" #shift ", %%mm6 \n\t"\ |
350 | "psrad $" #shift ", %%mm4 \n\t"\ | |
0a8d8945 MN |
351 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
352 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
353 | "movd %%mm2, 32+" #dst " \n\t"\ | |
354 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
355 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
356 | "movd %%mm6, 48+" #dst " \n\t"\ | |
357 | "movd %%mm4, 64+" #dst " \n\t"\ | |
358 | "movd %%mm5, 80+" #dst " \n\t"\ | |
359 | ||
115329f1 | 360 | |
0a8d8945 MN |
361 | #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ |
362 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
363 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
37e8dcda A |
364 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
365 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
4bdd9157 | 366 | "movq "MANGLE(wm1010)", %%mm4 \n\t"\ |
37e8dcda A |
367 | "pand %%mm0, %%mm4 \n\t"\ |
368 | "por %%mm1, %%mm4 \n\t"\ | |
369 | "por %%mm2, %%mm4 \n\t"\ | |
370 | "por %%mm3, %%mm4 \n\t"\ | |
371 | "packssdw %%mm4,%%mm4 \n\t"\ | |
372 | "movd %%mm4, %%eax \n\t"\ | |
373 | "orl %%eax, %%eax \n\t"\ | |
374 | "jz 1f \n\t"\ | |
0a8d8945 MN |
375 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
376 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
377 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
378 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
379 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
380 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
381 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
382 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
383 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
384 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
37e8dcda | 385 | #rounder ", %%mm4 \n\t"\ |
0a8d8945 MN |
386 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
387 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
388 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
389 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
390 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
391 | #rounder ", %%mm0 \n\t"\ | |
392 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
393 | "paddd %%mm0, %%mm0 \n\t" \ | |
394 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
395 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
396 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
397 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
398 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
399 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
37e8dcda | 400 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
0a8d8945 MN |
401 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
402 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
403 | "psrad $" #shift ", %%mm7 \n\t"\ | |
37e8dcda | 404 | "psrad $" #shift ", %%mm4 \n\t"\ |
0a8d8945 MN |
405 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
406 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
407 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
408 | "psrad $" #shift ", %%mm1 \n\t"\ | |
409 | "psrad $" #shift ", %%mm2 \n\t"\ | |
410 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
411 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
412 | "movq %%mm7, " #dst " \n\t"\ | |
413 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
414 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
415 | "movq %%mm2, 24+" #dst " \n\t"\ | |
416 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
417 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
418 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
37e8dcda | 419 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
0a8d8945 MN |
420 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
421 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
422 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
423 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
424 | "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ | |
37e8dcda A |
425 | "psrad $" #shift ", %%mm2 \n\t"\ |
426 | "psrad $" #shift ", %%mm0 \n\t"\ | |
0a8d8945 MN |
427 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
428 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
429 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
430 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
431 | "psrad $" #shift ", %%mm6 \n\t"\ | |
432 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
433 | "movq %%mm2, 8+" #dst " \n\t"\ | |
434 | "psrad $" #shift ", %%mm4 \n\t"\ | |
435 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
436 | "movq %%mm4, 16+" #dst " \n\t"\ | |
37e8dcda A |
437 | "jmp 2f \n\t"\ |
438 | "1: \n\t"\ | |
0a8d8945 | 439 | "pslld $16, %%mm0 \n\t"\ |
4bdd9157 | 440 | "#paddd "MANGLE(d40000)", %%mm0 \n\t"\ |
0a8d8945 MN |
441 | "psrad $13, %%mm0 \n\t"\ |
442 | "packssdw %%mm0, %%mm0 \n\t"\ | |
443 | "movq %%mm0, " #dst " \n\t"\ | |
444 | "movq %%mm0, 8+" #dst " \n\t"\ | |
445 | "movq %%mm0, 16+" #dst " \n\t"\ | |
446 | "movq %%mm0, 24+" #dst " \n\t"\ | |
447 | "2: \n\t" | |
37e8dcda | 448 | |
37e8dcda | 449 | |
0a8d8945 MN |
450 | //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
451 | ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) | |
452 | /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11) | |
453 | ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11) | |
454 | ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/ | |
455 | ||
456 | DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11) | |
457 | DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11) | |
458 | DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11) | |
459 | ||
460 | ||
461 | //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
462 | COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
463 | COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
464 | COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
465 | COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
37e8dcda | 466 | |
0a8d8945 MN |
467 | #else |
468 | ||
469 | #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
470 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
471 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
37e8dcda A |
472 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
473 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
4bdd9157 | 474 | "movq "MANGLE(wm1010)", %%mm4 \n\t"\ |
37e8dcda A |
475 | "pand %%mm0, %%mm4 \n\t"\ |
476 | "por %%mm1, %%mm4 \n\t"\ | |
477 | "por %%mm2, %%mm4 \n\t"\ | |
478 | "por %%mm3, %%mm4 \n\t"\ | |
479 | "packssdw %%mm4,%%mm4 \n\t"\ | |
480 | "movd %%mm4, %%eax \n\t"\ | |
481 | "orl %%eax, %%eax \n\t"\ | |
482 | "jz 1f \n\t"\ | |
0a8d8945 MN |
483 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
484 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
485 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
486 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
487 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
488 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
489 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
490 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
491 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
492 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
37e8dcda | 493 | #rounder ", %%mm4 \n\t"\ |
0a8d8945 MN |
494 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
495 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
496 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
497 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
498 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
499 | #rounder ", %%mm0 \n\t"\ | |
500 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
501 | "paddd %%mm0, %%mm0 \n\t" \ | |
502 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
503 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
504 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
505 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
506 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
507 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
37e8dcda | 508 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
0a8d8945 MN |
509 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
510 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
511 | "psrad $" #shift ", %%mm7 \n\t"\ | |
37e8dcda | 512 | "psrad $" #shift ", %%mm4 \n\t"\ |
0a8d8945 MN |
513 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
514 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
515 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
516 | "psrad $" #shift ", %%mm1 \n\t"\ | |
517 | "psrad $" #shift ", %%mm2 \n\t"\ | |
518 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
519 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
520 | "movq %%mm7, " #dst " \n\t"\ | |
521 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
522 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
523 | "movq %%mm2, 24+" #dst " \n\t"\ | |
524 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
525 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
526 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
37e8dcda | 527 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
0a8d8945 MN |
528 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
529 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
530 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
531 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
532 | "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ | |
37e8dcda A |
533 | "psrad $" #shift ", %%mm2 \n\t"\ |
534 | "psrad $" #shift ", %%mm0 \n\t"\ | |
0a8d8945 MN |
535 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
536 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
537 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
538 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
539 | "psrad $" #shift ", %%mm6 \n\t"\ | |
540 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
541 | "movq %%mm2, 8+" #dst " \n\t"\ | |
542 | "psrad $" #shift ", %%mm4 \n\t"\ | |
543 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
544 | "movq %%mm4, 16+" #dst " \n\t"\ | |
37e8dcda | 545 | "jmp 2f \n\t"\ |
37e8dcda | 546 | "1: \n\t"\ |
0a8d8945 | 547 | "pslld $16, %%mm0 \n\t"\ |
4bdd9157 | 548 | "paddd "MANGLE(d40000)", %%mm0 \n\t"\ |
0a8d8945 MN |
549 | "psrad $13, %%mm0 \n\t"\ |
550 | "packssdw %%mm0, %%mm0 \n\t"\ | |
551 | "movq %%mm0, " #dst " \n\t"\ | |
552 | "movq %%mm0, 8+" #dst " \n\t"\ | |
553 | "movq %%mm0, 16+" #dst " \n\t"\ | |
554 | "movq %%mm0, 24+" #dst " \n\t"\ | |
555 | "2: \n\t" | |
37e8dcda | 556 | |
0a8d8945 MN |
557 | #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \ |
558 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
559 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
37e8dcda A |
560 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
561 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
562 | "movq %%mm0, %%mm4 \n\t"\ | |
563 | "por %%mm1, %%mm4 \n\t"\ | |
564 | "por %%mm2, %%mm4 \n\t"\ | |
565 | "por %%mm3, %%mm4 \n\t"\ | |
0a8d8945 | 566 | "packssdw %%mm4,%%mm4 \n\t"\ |
37e8dcda A |
567 | "movd %%mm4, %%eax \n\t"\ |
568 | "orl %%eax, %%eax \n\t"\ | |
569 | "jz " #bt " \n\t"\ | |
0a8d8945 MN |
570 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
571 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
572 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
573 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
574 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
575 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
576 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
577 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
578 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
579 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
37e8dcda | 580 | #rounder ", %%mm4 \n\t"\ |
0a8d8945 MN |
581 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
582 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
583 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
584 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
585 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
586 | #rounder ", %%mm0 \n\t"\ | |
587 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ | |
588 | "paddd %%mm0, %%mm0 \n\t" \ | |
589 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
590 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
591 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
592 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
593 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
594 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
37e8dcda | 595 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
0a8d8945 MN |
596 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
597 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
598 | "psrad $" #shift ", %%mm7 \n\t"\ | |
37e8dcda | 599 | "psrad $" #shift ", %%mm4 \n\t"\ |
0a8d8945 MN |
600 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
601 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
602 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
603 | "psrad $" #shift ", %%mm1 \n\t"\ | |
604 | "psrad $" #shift ", %%mm2 \n\t"\ | |
605 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
606 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
607 | "movq %%mm7, " #dst " \n\t"\ | |
608 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
609 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
610 | "movq %%mm2, 24+" #dst " \n\t"\ | |
611 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
612 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
613 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
37e8dcda | 614 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
0a8d8945 MN |
615 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
616 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
617 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
618 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
619 | "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ | |
620 | "psrad $" #shift ", %%mm2 \n\t"\ | |
621 | "psrad $" #shift ", %%mm0 \n\t"\ | |
622 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
623 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
624 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
625 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
37e8dcda | 626 | "psrad $" #shift ", %%mm6 \n\t"\ |
0a8d8945 MN |
627 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ |
628 | "movq %%mm2, 8+" #dst " \n\t"\ | |
37e8dcda | 629 | "psrad $" #shift ", %%mm4 \n\t"\ |
0a8d8945 MN |
630 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ |
631 | "movq %%mm4, 16+" #dst " \n\t"\ | |
632 | ||
633 | #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
634 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
635 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
636 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ | |
637 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
638 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
639 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
640 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
641 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
642 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
643 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
644 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
645 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
646 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
647 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
648 | #rounder ", %%mm4 \n\t"\ | |
649 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
650 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ | |
651 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
652 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ | |
653 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
37e8dcda | 654 | #rounder ", %%mm0 \n\t"\ |
0a8d8945 MN |
655 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
656 | "paddd %%mm0, %%mm0 \n\t" \ | |
657 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ | |
658 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
659 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ | |
660 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
661 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
662 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
663 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
664 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
665 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ | |
666 | "psrad $" #shift ", %%mm7 \n\t"\ | |
667 | "psrad $" #shift ", %%mm4 \n\t"\ | |
668 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ | |
669 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ | |
670 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
671 | "psrad $" #shift ", %%mm1 \n\t"\ | |
672 | "psrad $" #shift ", %%mm2 \n\t"\ | |
673 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ | |
674 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ | |
675 | "movq %%mm7, " #dst " \n\t"\ | |
676 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ | |
677 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
678 | "movq %%mm2, 24+" #dst " \n\t"\ | |
679 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
680 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
681 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
682 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
683 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ | |
684 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
685 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
686 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
687 | "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ | |
37e8dcda A |
688 | "psrad $" #shift ", %%mm2 \n\t"\ |
689 | "psrad $" #shift ", %%mm0 \n\t"\ | |
0a8d8945 MN |
690 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
691 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ | |
692 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
693 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
694 | "psrad $" #shift ", %%mm6 \n\t"\ | |
695 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ | |
696 | "movq %%mm2, 8+" #dst " \n\t"\ | |
697 | "psrad $" #shift ", %%mm4 \n\t"\ | |
698 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ | |
699 | "movq %%mm4, 16+" #dst " \n\t"\ | |
700 | ||
701 | //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
702 | DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) | |
703 | Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f) | |
704 | Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f) | |
705 | Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) | |
706 | ||
707 | #undef IDCT | |
708 | #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
709 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
710 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
37e8dcda A |
711 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
712 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
0a8d8945 MN |
713 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
714 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
715 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
716 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
717 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
718 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
719 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
720 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
721 | #rounder ", %%mm4 \n\t"\ | |
722 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
723 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
724 | #rounder ", %%mm0 \n\t"\ | |
725 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
37e8dcda | 726 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
0a8d8945 MN |
727 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
728 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
729 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
730 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
731 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
732 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
733 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
734 | "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ | |
735 | "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
736 | "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
737 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
37e8dcda | 738 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
0a8d8945 MN |
739 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
740 | "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ | |
741 | "psrad $" #shift ", %%mm7 \n\t"\ | |
37e8dcda | 742 | "psrad $" #shift ", %%mm4 \n\t"\ |
0a8d8945 MN |
743 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
744 | "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
745 | "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
746 | "psrad $" #shift ", %%mm0 \n\t"\ | |
747 | "psrad $" #shift ", %%mm2 \n\t"\ | |
748 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
749 | "movd %%mm7, " #dst " \n\t"\ | |
750 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
751 | "movd %%mm0, 16+" #dst " \n\t"\ | |
752 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
753 | "movd %%mm2, 96+" #dst " \n\t"\ | |
754 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
755 | "movd %%mm4, 112+" #dst " \n\t"\ | |
756 | "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ | |
757 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
758 | "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
759 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
760 | "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
37e8dcda | 761 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
0a8d8945 MN |
762 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
763 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
764 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
765 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
766 | "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
767 | "psrad $" #shift ", %%mm2 \n\t"\ | |
768 | "psrad $" #shift ", %%mm5 \n\t"\ | |
769 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
770 | "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ | |
771 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
772 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
37e8dcda A |
773 | "psrad $" #shift ", %%mm6 \n\t"\ |
774 | "psrad $" #shift ", %%mm4 \n\t"\ | |
0a8d8945 MN |
775 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
776 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
777 | "movd %%mm2, 32+" #dst " \n\t"\ | |
778 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
779 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
780 | "movd %%mm6, 48+" #dst " \n\t"\ | |
781 | "movd %%mm4, 64+" #dst " \n\t"\ | |
782 | "movd %%mm5, 80+" #dst " \n\t" | |
783 | ||
784 | ||
785 | //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
786 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
787 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
788 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
789 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
37e8dcda A |
790 | "jmp 9f \n\t" |
791 | ||
792 | "#.balign 16 \n\t"\ | |
793 | "4: \n\t" | |
0a8d8945 MN |
794 | Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f) |
795 | Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) | |
37e8dcda | 796 | |
0a8d8945 MN |
797 | #undef IDCT |
798 | #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
799 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
800 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
37e8dcda | 801 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
0a8d8945 MN |
802 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
803 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
804 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
805 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
806 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
807 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
808 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
809 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
810 | #rounder ", %%mm4 \n\t"\ | |
811 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
812 | #rounder ", %%mm0 \n\t"\ | |
37e8dcda | 813 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
0a8d8945 MN |
814 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
815 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
816 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
817 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
818 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
819 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
37e8dcda A |
820 | "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ |
821 | "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
0a8d8945 MN |
822 | "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
823 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
824 | "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
825 | "psrad $" #shift ", %%mm1 \n\t"\ | |
37e8dcda | 826 | "psrad $" #shift ", %%mm4 \n\t"\ |
0a8d8945 MN |
827 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
828 | "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
829 | "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
37e8dcda | 830 | "psrad $" #shift ", %%mm0 \n\t"\ |
0a8d8945 MN |
831 | "psrad $" #shift ", %%mm2 \n\t"\ |
832 | "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ | |
833 | "movd %%mm1, " #dst " \n\t"\ | |
834 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
835 | "movd %%mm0, 16+" #dst " \n\t"\ | |
836 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
837 | "movd %%mm2, 96+" #dst " \n\t"\ | |
838 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
839 | "movd %%mm4, 112+" #dst " \n\t"\ | |
840 | "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ | |
841 | "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
842 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
843 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
844 | "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
845 | "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
846 | "psrad $" #shift ", %%mm2 \n\t"\ | |
847 | "psrad $" #shift ", %%mm5 \n\t"\ | |
848 | "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ | |
849 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
850 | "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ | |
851 | "psrad $" #shift ", %%mm6 \n\t"\ | |
852 | "psrad $" #shift ", %%mm1 \n\t"\ | |
853 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
854 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
855 | "movd %%mm2, 32+" #dst " \n\t"\ | |
856 | "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ | |
857 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
858 | "movd %%mm6, 48+" #dst " \n\t"\ | |
859 | "movd %%mm1, 64+" #dst " \n\t"\ | |
115329f1 | 860 | "movd %%mm5, 80+" #dst " \n\t" |
0a8d8945 MN |
861 | |
862 | //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
863 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
864 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
865 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
866 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
37e8dcda A |
867 | "jmp 9f \n\t" |
868 | ||
869 | "#.balign 16 \n\t"\ | |
870 | "6: \n\t" | |
0a8d8945 | 871 | Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f) |
37e8dcda | 872 | |
0a8d8945 MN |
873 | #undef IDCT |
874 | #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
875 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
37e8dcda | 876 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
0a8d8945 MN |
877 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
878 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
879 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
880 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
881 | #rounder ", %%mm4 \n\t"\ | |
882 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
883 | #rounder ", %%mm0 \n\t"\ | |
884 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
885 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
886 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
37e8dcda A |
887 | "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ |
888 | "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
0a8d8945 MN |
889 | "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
890 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ | |
891 | "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
892 | "psrad $" #shift ", %%mm1 \n\t"\ | |
37e8dcda | 893 | "psrad $" #shift ", %%mm4 \n\t"\ |
0a8d8945 MN |
894 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
895 | "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
896 | "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
37e8dcda | 897 | "psrad $" #shift ", %%mm0 \n\t"\ |
0a8d8945 MN |
898 | "psrad $" #shift ", %%mm2 \n\t"\ |
899 | "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ | |
900 | "movd %%mm1, " #dst " \n\t"\ | |
901 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
902 | "movd %%mm0, 16+" #dst " \n\t"\ | |
903 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
904 | "movd %%mm2, 96+" #dst " \n\t"\ | |
905 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
906 | "movd %%mm4, 112+" #dst " \n\t"\ | |
907 | "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ | |
908 | "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ | |
909 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ | |
910 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
911 | "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
912 | "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
913 | "psrad $" #shift ", %%mm2 \n\t"\ | |
914 | "psrad $" #shift ", %%mm5 \n\t"\ | |
915 | "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ | |
916 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
917 | "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ | |
918 | "psrad $" #shift ", %%mm6 \n\t"\ | |
919 | "psrad $" #shift ", %%mm1 \n\t"\ | |
920 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
921 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
922 | "movd %%mm2, 32+" #dst " \n\t"\ | |
923 | "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ | |
924 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
925 | "movd %%mm6, 48+" #dst " \n\t"\ | |
926 | "movd %%mm1, 64+" #dst " \n\t"\ | |
115329f1 | 927 | "movd %%mm5, 80+" #dst " \n\t" |
0a8d8945 MN |
928 | |
929 | ||
930 | //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
931 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
932 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
933 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
934 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
37e8dcda A |
935 | "jmp 9f \n\t" |
936 | ||
937 | "#.balign 16 \n\t"\ | |
938 | "2: \n\t" | |
0a8d8945 | 939 | Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f) |
37e8dcda | 940 | |
0a8d8945 MN |
941 | #undef IDCT |
942 | #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
943 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
37e8dcda A |
944 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
945 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ | |
0a8d8945 MN |
946 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
947 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
948 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
949 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
950 | #rounder ", %%mm4 \n\t"\ | |
951 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
952 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
953 | #rounder ", %%mm0 \n\t"\ | |
954 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
955 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
956 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ | |
957 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ | |
958 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
959 | "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ | |
960 | "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ | |
961 | "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ | |
962 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
37e8dcda | 963 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
0a8d8945 MN |
964 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
965 | "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ | |
966 | "psrad $" #shift ", %%mm7 \n\t"\ | |
37e8dcda | 967 | "psrad $" #shift ", %%mm4 \n\t"\ |
0a8d8945 MN |
968 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
969 | "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
970 | "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
971 | "psrad $" #shift ", %%mm0 \n\t"\ | |
972 | "psrad $" #shift ", %%mm2 \n\t"\ | |
973 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
974 | "movd %%mm7, " #dst " \n\t"\ | |
975 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
976 | "movd %%mm0, 16+" #dst " \n\t"\ | |
977 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ | |
978 | "movd %%mm2, 96+" #dst " \n\t"\ | |
979 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
980 | "movd %%mm4, 112+" #dst " \n\t"\ | |
981 | "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ | |
982 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
983 | "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
984 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ | |
985 | "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
37e8dcda | 986 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
0a8d8945 MN |
987 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
988 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ | |
989 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ | |
990 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ | |
991 | "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
992 | "psrad $" #shift ", %%mm2 \n\t"\ | |
993 | "psrad $" #shift ", %%mm5 \n\t"\ | |
994 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
995 | "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ | |
996 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
997 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
37e8dcda A |
998 | "psrad $" #shift ", %%mm6 \n\t"\ |
999 | "psrad $" #shift ", %%mm4 \n\t"\ | |
0a8d8945 MN |
1000 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
1001 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1002 | "movd %%mm2, 32+" #dst " \n\t"\ | |
1003 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
1004 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
1005 | "movd %%mm6, 48+" #dst " \n\t"\ | |
1006 | "movd %%mm4, 64+" #dst " \n\t"\ | |
1007 | "movd %%mm5, 80+" #dst " \n\t" | |
1008 | ||
1009 | //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
1010 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
1011 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
1012 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
1013 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
37e8dcda A |
1014 | "jmp 9f \n\t" |
1015 | ||
1016 | "#.balign 16 \n\t"\ | |
1017 | "3: \n\t" | |
0a8d8945 MN |
1018 | #undef IDCT |
1019 | #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
1020 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
37e8dcda | 1021 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
0a8d8945 MN |
1022 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
1023 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1024 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1025 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1026 | #rounder ", %%mm4 \n\t"\ | |
1027 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1028 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
1029 | #rounder ", %%mm0 \n\t"\ | |
1030 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
1031 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1032 | "movq 64(%2), %%mm3 \n\t"\ | |
1033 | "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
1034 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
37e8dcda | 1035 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
0a8d8945 MN |
1036 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
1037 | "psrad $" #shift ", %%mm7 \n\t"\ | |
37e8dcda | 1038 | "psrad $" #shift ", %%mm4 \n\t"\ |
0a8d8945 MN |
1039 | "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
1040 | "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1041 | "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\ | |
1042 | "psrad $" #shift ", %%mm0 \n\t"\ | |
1043 | "psrad $" #shift ", %%mm1 \n\t"\ | |
1044 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
1045 | "movd %%mm7, " #dst " \n\t"\ | |
1046 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1047 | "movd %%mm0, 16+" #dst " \n\t"\ | |
1048 | "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\ | |
1049 | "movd %%mm1, 96+" #dst " \n\t"\ | |
1050 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
1051 | "movd %%mm4, 112+" #dst " \n\t"\ | |
1052 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
1053 | "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
1054 | "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
1055 | "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\ | |
1056 | "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\ | |
1057 | "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
1058 | "psrad $" #shift ", %%mm1 \n\t"\ | |
37e8dcda | 1059 | "psrad $" #shift ", %%mm5 \n\t"\ |
0a8d8945 MN |
1060 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
1061 | "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1062 | "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
37e8dcda A |
1063 | "psrad $" #shift ", %%mm6 \n\t"\ |
1064 | "psrad $" #shift ", %%mm4 \n\t"\ | |
0a8d8945 MN |
1065 | "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\ |
1066 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1067 | "movd %%mm1, 32+" #dst " \n\t"\ | |
1068 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
1069 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
1070 | "movd %%mm6, 48+" #dst " \n\t"\ | |
1071 | "movd %%mm4, 64+" #dst " \n\t"\ | |
1072 | "movd %%mm5, 80+" #dst " \n\t" | |
1073 | ||
1074 | ||
1075 | //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
1076 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
1077 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
1078 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
1079 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
37e8dcda A |
1080 | "jmp 9f \n\t" |
1081 | ||
1082 | "#.balign 16 \n\t"\ | |
1083 | "5: \n\t" | |
0a8d8945 MN |
1084 | #undef IDCT |
1085 | #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
1086 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
1087 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
1088 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
1089 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1090 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1091 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1092 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
1093 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
1094 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
1095 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
1096 | #rounder ", %%mm4 \n\t"\ | |
1097 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
37e8dcda | 1098 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
0a8d8945 MN |
1099 | #rounder ", %%mm0 \n\t"\ |
1100 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ | |
1101 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1102 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
1103 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
1104 | "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ | |
1105 | "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\ | |
1106 | "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ | |
1107 | "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1108 | "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ | |
1109 | "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1110 | "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ | |
1111 | "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
1112 | "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
1113 | #rounder ", %%mm1 \n\t"\ | |
1114 | "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\ | |
1115 | "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\ | |
1116 | #rounder ", %%mm2 \n\t"\ | |
1117 | "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\ | |
1118 | "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\ | |
1119 | "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\ | |
1120 | "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\ | |
37e8dcda | 1121 | "psrad $" #shift ", %%mm4 \n\t"\ |
37e8dcda | 1122 | "psrad $" #shift ", %%mm7 \n\t"\ |
0a8d8945 MN |
1123 | "psrad $" #shift ", %%mm3 \n\t"\ |
1124 | "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\ | |
1125 | "movq %%mm4, " #dst " \n\t"\ | |
37e8dcda | 1126 | "psrad $" #shift ", %%mm0 \n\t"\ |
0a8d8945 MN |
1127 | "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\ |
1128 | "movq %%mm0, 16+" #dst " \n\t"\ | |
1129 | "movq %%mm0, 96+" #dst " \n\t"\ | |
1130 | "movq %%mm4, 112+" #dst " \n\t"\ | |
1131 | "psrad $" #shift ", %%mm5 \n\t"\ | |
1132 | "psrad $" #shift ", %%mm6 \n\t"\ | |
37e8dcda | 1133 | "psrad $" #shift ", %%mm2 \n\t"\ |
0a8d8945 MN |
1134 | "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
1135 | "movq %%mm5, 32+" #dst " \n\t"\ | |
1136 | "psrad $" #shift ", %%mm1 \n\t"\ | |
1137 | "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1138 | "movq %%mm6, 48+" #dst " \n\t"\ | |
1139 | "movq %%mm6, 64+" #dst " \n\t"\ | |
115329f1 DB |
1140 | "movq %%mm5, 80+" #dst " \n\t" |
1141 | ||
0a8d8945 MN |
1142 | |
1143 | //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
1144 | IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
1145 | //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
1146 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
1147 | //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
37e8dcda A |
1148 | "jmp 9f \n\t" |
1149 | ||
1150 | ||
1151 | "#.balign 16 \n\t"\ | |
1152 | "1: \n\t" | |
0a8d8945 MN |
1153 | #undef IDCT |
1154 | #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
1155 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
1156 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ | |
37e8dcda | 1157 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
0a8d8945 MN |
1158 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
1159 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1160 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1161 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1162 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ | |
1163 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ | |
1164 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ | |
1165 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ | |
1166 | #rounder ", %%mm4 \n\t"\ | |
1167 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1168 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ | |
1169 | #rounder ", %%mm0 \n\t"\ | |
1170 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ | |
37e8dcda | 1171 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
0a8d8945 MN |
1172 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
1173 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1174 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ | |
1175 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ | |
1176 | "movq 64(%2), %%mm1 \n\t"\ | |
1177 | "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ | |
1178 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
37e8dcda | 1179 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
0a8d8945 MN |
1180 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
1181 | "psrad $" #shift ", %%mm7 \n\t"\ | |
37e8dcda | 1182 | "psrad $" #shift ", %%mm4 \n\t"\ |
0a8d8945 MN |
1183 | "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\ |
1184 | "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1185 | "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\ | |
1186 | "psrad $" #shift ", %%mm0 \n\t"\ | |
1187 | "psrad $" #shift ", %%mm3 \n\t"\ | |
1188 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ | |
1189 | "movd %%mm7, " #dst " \n\t"\ | |
1190 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ | |
1191 | "movd %%mm0, 16+" #dst " \n\t"\ | |
1192 | "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\ | |
1193 | "movd %%mm3, 96+" #dst " \n\t"\ | |
1194 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ | |
1195 | "movd %%mm4, 112+" #dst " \n\t"\ | |
1196 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ | |
1197 | "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ | |
1198 | "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ | |
1199 | "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\ | |
1200 | "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\ | |
1201 | "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ | |
1202 | "psrad $" #shift ", %%mm3 \n\t"\ | |
1203 | "psrad $" #shift ", %%mm5 \n\t"\ | |
1204 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ | |
1205 | "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ | |
1206 | "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ | |
37e8dcda | 1207 | "psrad $" #shift ", %%mm6 \n\t"\ |
0a8d8945 MN |
1208 | "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\ |
1209 | "movd %%mm3, 32+" #dst " \n\t"\ | |
37e8dcda | 1210 | "psrad $" #shift ", %%mm4 \n\t"\ |
0a8d8945 MN |
1211 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
1212 | "movd %%mm6, 48+" #dst " \n\t"\ | |
1213 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ | |
1214 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ | |
1215 | "movd %%mm4, 64+" #dst " \n\t"\ | |
1216 | "movd %%mm5, 80+" #dst " \n\t" | |
115329f1 | 1217 | |
0a8d8945 MN |
1218 | |
1219 | //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
1220 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
1221 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
1222 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
1223 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
37e8dcda A |
1224 | "jmp 9f \n\t" |
1225 | ||
1226 | ||
1227 | "#.balign 16 \n\t" | |
1228 | "7: \n\t" | |
0a8d8945 MN |
1229 | #undef IDCT |
1230 | #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ | |
1231 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ | |
1232 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ | |
1233 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1234 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ | |
1235 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1236 | #rounder ", %%mm4 \n\t"\ | |
1237 | #rounder ", %%mm0 \n\t"\ | |
37e8dcda | 1238 | "psrad $" #shift ", %%mm4 \n\t"\ |
37e8dcda | 1239 | "psrad $" #shift ", %%mm0 \n\t"\ |
0a8d8945 MN |
1240 | "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ |
1241 | "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ | |
1242 | "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ | |
1243 | "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ | |
1244 | "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ | |
1245 | "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ | |
1246 | #rounder ", %%mm1 \n\t"\ | |
1247 | #rounder ", %%mm2 \n\t"\ | |
37e8dcda | 1248 | "psrad $" #shift ", %%mm1 \n\t"\ |
0a8d8945 MN |
1249 | "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\ |
1250 | "movq %%mm4, " #dst " \n\t"\ | |
1251 | "psrad $" #shift ", %%mm2 \n\t"\ | |
1252 | "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\ | |
1253 | "movq %%mm0, 16+" #dst " \n\t"\ | |
1254 | "movq %%mm0, 96+" #dst " \n\t"\ | |
1255 | "movq %%mm4, 112+" #dst " \n\t"\ | |
1256 | "movq %%mm0, 32+" #dst " \n\t"\ | |
1257 | "movq %%mm4, 48+" #dst " \n\t"\ | |
1258 | "movq %%mm4, 64+" #dst " \n\t"\ | |
115329f1 | 1259 | "movq %%mm0, 80+" #dst " \n\t" |
0a8d8945 MN |
1260 | |
1261 | //IDCT( src0, src4, src1, src5, dst, rounder, shift) | |
1262 | IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) | |
1263 | //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) | |
1264 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) | |
1265 | //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) | |
37e8dcda A |
1266 | |
1267 | ||
1268 | #endif | |
1269 | ||
1270 | /* | |
1271 | Input | |
0a8d8945 MN |
1272 | 00 40 04 44 20 60 24 64 |
1273 | 10 30 14 34 50 70 54 74 | |
1274 | 01 41 03 43 21 61 23 63 | |
37e8dcda | 1275 | 11 31 13 33 51 71 53 73 |
0a8d8945 MN |
1276 | 02 42 06 46 22 62 26 66 |
1277 | 12 32 16 36 52 72 56 76 | |
1278 | 05 45 07 47 25 65 27 67 | |
1279 | 15 35 17 37 55 75 57 77 | |
115329f1 | 1280 | |
37e8dcda | 1281 | Temp |
0a8d8945 MN |
1282 | 00 04 10 14 20 24 30 34 |
1283 | 40 44 50 54 60 64 70 74 | |
37e8dcda A |
1284 | 01 03 11 13 21 23 31 33 |
1285 | 41 43 51 53 61 63 71 73 | |
0a8d8945 MN |
1286 | 02 06 12 16 22 26 32 36 |
1287 | 42 46 52 56 62 66 72 76 | |
37e8dcda A |
1288 | 05 07 15 17 25 27 35 37 |
1289 | 45 47 55 57 65 67 75 77 | |
1290 | */ | |
1291 | ||
37e8dcda A |
1292 | "9: \n\t" |
1293 | :: "r" (block), "r" (temp), "r" (coeffs) | |
1294 | : "%eax" | |
1295 | ); | |
37e8dcda A |
1296 | } |
1297 | ||
/**
 * Run the MMX simple IDCT on one coefficient block.
 *
 * Thin public entry point around the file-local idct() routine, which
 * writes its output back through the same pointer it is given, so the
 * transform happens in place.
 *
 * @param block coefficient block; overwritten with the IDCT result
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
1302 | ||
1303 | //FIXME merge add/put into the idct | |
1304 | ||
0c1a9eda | 1305 | void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block) |
2ad1516a MN |
1306 | { |
1307 | idct(block); | |
ec7e0bf0 | 1308 | put_pixels_clamped_mmx(block, dest, line_size); |
2ad1516a | 1309 | } |
0c1a9eda | 1310 | void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) |
2ad1516a MN |
1311 | { |
1312 | idct(block); | |
ec7e0bf0 | 1313 | add_pixels_clamped_mmx(block, dest, line_size); |
37e8dcda | 1314 | } |