correct MMX-optimized variant of VP3 IDCT, with comments (thank you
[libav.git] / libavcodec / i386 / vp3dsp_mmx.c
1 /*
2 * Copyright (C) 2004 the ffmpeg project
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19 /**
20 * @file vp3dsp_mmx.c
21 * MMX-optimized functions cribbed from the original VP3 source code.
22 */
23
24 #include "../dsputil.h"
25 #include "mmx.h"
26
27 #define IdctAdjustBeforeShift 8
28
29 /* (12 * 4) 2-byte memory locations ( = 96 bytes total)
30 * idct_constants[0..15] = Mask table (M(I))
31 * idct_constants[16..43] = Cosine table (C(I))
32 * idct_constants[44..47] = 8
33 */
34 static uint16_t idct_constants[(4 + 7 + 1) * 4];
35 static uint16_t idct_cosine_table[7] = {
36 64277, 60547, 54491, 46341, 36410, 25080, 12785
37 };
38
39 #define r0 mm0
40 #define r1 mm1
41 #define r2 mm2
42 #define r3 mm3
43 #define r4 mm4
44 #define r5 mm5
45 #define r6 mm6
46 #define r7 mm7
47
48 /* from original comments: The Macro does IDct on 4 1-D Dcts */
49 #define BeginIDCT() { \
50 movq_m2r(*I(3), r2); \
51 movq_m2r(*C(3), r6); \
52 movq_r2r(r2, r4); \
53 movq_m2r(*J(5), r7); \
54 pmulhw_r2r(r6, r4); /* r4 = c3*i3 - i3 */ \
55 movq_m2r(*C(5), r1); \
56 pmulhw_r2r(r7, r6); /* r6 = c3*i5 - i5 */ \
57 movq_r2r(r1, r5); \
58 pmulhw_r2r(r2, r1); /* r1 = c5*i3 - i3 */ \
59 movq_m2r(*I(1), r3); \
60 pmulhw_r2r(r7, r5); /* r5 = c5*i5 - i5 */ \
61 movq_m2r(*C(1), r0); /* (all registers are in use) */ \
62 paddw_r2r(r2, r4); /* r4 = c3*i3 */ \
63 paddw_r2r(r7, r6); /* r6 = c3*i5 */ \
64 paddw_r2r(r1, r2); /* r2 = c5*i3 */ \
65 movq_m2r(*J(7), r1); \
66 paddw_r2r(r5, r7); /* r7 = c5*i5 */ \
67 movq_r2r(r0, r5); /* r5 = c1 */ \
68 pmulhw_r2r(r3, r0); /* r0 = c1*i1 - i1 */ \
69 paddsw_r2r(r7, r4); /* r4 = C = c3*i3 + c5*i5 */ \
70 pmulhw_r2r(r1, r5); /* r5 = c1*i7 - i7 */ \
71 movq_m2r(*C(7), r7); \
72 psubsw_r2r(r2, r6); /* r6 = D = c3*i5 - c5*i3 */ \
73 paddw_r2r(r3, r0); /* r0 = c1*i1 */ \
74 pmulhw_r2r(r7, r3); /* r3 = c7*i1 */ \
75 movq_m2r(*I(2), r2); \
76 pmulhw_r2r(r1, r7); /* r7 = c7*i7 */ \
77 paddw_r2r(r1, r5); /* r5 = c1*i7 */ \
78 movq_r2r(r2, r1); /* r1 = i2 */ \
79 pmulhw_m2r(*C(2), r2); /* r2 = c2*i2 - i2 */ \
80 psubsw_r2r(r5, r3); /* r3 = B = c7*i1 - c1*i7 */ \
81 movq_m2r(*J(6), r5); \
82 paddsw_r2r(r7, r0); /* r0 = A = c1*i1 + c7*i7 */ \
83 movq_r2r(r5, r7); /* r7 = i6 */ \
84 psubsw_r2r(r4, r0); /* r0 = A - C */ \
85 pmulhw_m2r(*C(2), r5); /* r5 = c2*i6 - i6 */ \
86 paddw_r2r(r1, r2); /* r2 = c2*i2 */ \
87 pmulhw_m2r(*C(6), r1); /* r1 = c6*i2 */ \
88 paddsw_r2r(r4, r4); /* r4 = C + C */ \
89 paddsw_r2r(r0, r4); /* r4 = C. = A + C */ \
90 psubsw_r2r(r6, r3); /* r3 = B - D */ \
91 paddw_r2r(r7, r5); /* r5 = c2*i6 */ \
92 paddsw_r2r(r6, r6); /* r6 = D + D */ \
93 pmulhw_m2r(*C(6), r7); /* r7 = c6*i6 */ \
94 paddsw_r2r(r3, r6); /* r6 = D. = B + D */ \
95 movq_r2m(r4, *I(1)); /* save C. at I(1) */ \
96 psubsw_r2r(r5, r1); /* r1 = H = c6*i2 - c2*i6 */ \
97 movq_m2r(*C(4), r4); \
98 movq_r2r(r3, r5); /* r5 = B - D */ \
99 pmulhw_r2r(r4, r3); /* r3 = (c4 - 1) * (B - D) */ \
100 paddsw_r2r(r2, r7); /* r7 = G = c6*i6 + c2*i2 */ \
101 movq_r2m(r6, *I(2)); /* save D. at I(2) */ \
102 movq_r2r(r0, r2); /* r2 = A - C */ \
103 movq_m2r(*I(0), r6); \
104 pmulhw_r2r(r4, r0); /* r0 = (c4 - 1) * (A - C) */ \
105 paddw_r2r(r3, r5); /* r5 = B. = c4 * (B - D) */ \
106 movq_m2r(*J(4), r3); \
107 psubsw_r2r(r1, r5); /* r5 = B.. = B. - H */ \
108 paddw_r2r(r0, r2); /* r0 = A. = c4 * (A - C) */ \
109 psubsw_r2r(r3, r6); /* r6 = i0 - i4 */ \
110 movq_r2r(r6, r0); \
111 pmulhw_r2r(r4, r6); /* r6 = (c4 - 1) * (i0 - i4) */ \
112 paddsw_r2r(r3, r3); /* r3 = i4 + i4 */ \
113 paddsw_r2r(r1, r1); /* r1 = H + H */ \
114 paddsw_r2r(r0, r3); /* r3 = i0 + i4 */ \
115 paddsw_r2r(r5, r1); /* r1 = H. = B + H */ \
116 pmulhw_r2r(r3, r4); /* r4 = (c4 - 1) * (i0 + i4) */ \
117 paddsw_r2r(r0, r6); /* r6 = F = c4 * (i0 - i4) */ \
118 psubsw_r2r(r2, r6); /* r6 = F. = F - A. */ \
119 paddsw_r2r(r2, r2); /* r2 = A. + A. */ \
120 movq_m2r(*I(1), r0); /* r0 = C. */ \
121 paddsw_r2r(r6, r2); /* r2 = A.. = F + A. */ \
122 paddw_r2r(r3, r4); /* r4 = E = c4 * (i0 + i4) */ \
123 psubsw_r2r(r1, r2); /* r2 = R2 = A.. - H. */ \
124 }
125
126 /* RowIDCT gets ready to transpose */
127 #define RowIDCT() { \
128 \
129 BeginIDCT(); \
130 \
131 movq_m2r(*I(2), r3); /* r3 = D. */ \
132 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
133 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
134 paddsw_r2r(r7, r7); /* r7 = G + G */ \
135 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
136 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
137 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
138 paddsw_r2r(r3, r3); \
139 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
140 paddsw_r2r(r5, r5); \
141 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
142 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
143 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
144 paddsw_r2r(r0, r0); \
145 movq_r2m(r1, *I(1)); /* save R1 */ \
146 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
147 }
148
149 /* Column IDCT normalizes and stores final results */
150 #define ColumnIDCT() { \
151 \
152 BeginIDCT(); \
153 \
154 paddsw_m2r(*Eight, r2); /* adjust R2 (and R1) for shift */ \
155 paddsw_r2r(r1, r1); /* r1 = H. + H. */ \
156 paddsw_r2r(r2, r1); /* r1 = R1 = A.. + H. */ \
157 psraw_i2r(4, r2); /* r2 = NR2 */ \
158 psubsw_r2r(r7, r4); /* r4 = E. = E - G */ \
159 psraw_i2r(4, r1); /* r1 = NR1 */ \
160 movq_m2r(*I(2), r3); /* r3 = D. */ \
161 paddsw_r2r(r7, r7); /* r7 = G + G */ \
162 movq_r2m(r2, *I(2)); /* store NR2 at I2 */ \
163 paddsw_r2r(r4, r7); /* r7 = G. = E + G */ \
164 movq_r2m(r1, *I(1)); /* store NR1 at I1 */ \
165 psubsw_r2r(r3, r4); /* r4 = R4 = E. - D. */ \
166 paddsw_m2r(*Eight, r4); /* adjust R4 (and R3) for shift */ \
167 paddsw_r2r(r3, r3); /* r3 = D. + D. */ \
168 paddsw_r2r(r4, r3); /* r3 = R3 = E. + D. */ \
169 psraw_i2r(4, r4); /* r4 = NR4 */ \
170 psubsw_r2r(r5, r6); /* r6 = R6 = F. - B.. */ \
171 psraw_i2r(4, r3); /* r3 = NR3 */ \
172 paddsw_m2r(*Eight, r6); /* adjust R6 (and R5) for shift */ \
173 paddsw_r2r(r5, r5); /* r5 = B.. + B.. */ \
174 paddsw_r2r(r6, r5); /* r5 = R5 = F. + B.. */ \
175 psraw_i2r(4, r6); /* r6 = NR6 */ \
176 movq_r2m(r4, *J(4)); /* store NR4 at J4 */ \
177 psraw_i2r(4, r5); /* r5 = NR5 */ \
178 movq_r2m(r3, *I(3)); /* store NR3 at I3 */ \
179 psubsw_r2r(r0, r7); /* r7 = R7 = G. - C. */ \
180 paddsw_m2r(*Eight, r7); /* adjust R7 (and R0) for shift */ \
181 paddsw_r2r(r0, r0); /* r0 = C. + C. */ \
182 paddsw_r2r(r7, r0); /* r0 = R0 = G. + C. */ \
183 psraw_i2r(4, r7); /* r7 = NR7 */ \
184 movq_r2m(r6, *J(6)); /* store NR6 at J6 */ \
185 psraw_i2r(4, r0); /* r0 = NR0 */ \
186 movq_r2m(r5, *J(5)); /* store NR5 at J5 */ \
187 movq_r2m(r7, *J(7)); /* store NR7 at J7 */ \
188 movq_r2m(r0, *I(0)); /* store NR0 at I0 */ \
189 }
190
191 /* Following macro does two 4x4 transposes in place.
192
193 At entry (we assume):
194
195 r0 = a3 a2 a1 a0
196 I(1) = b3 b2 b1 b0
197 r2 = c3 c2 c1 c0
198 r3 = d3 d2 d1 d0
199
200 r4 = e3 e2 e1 e0
201 r5 = f3 f2 f1 f0
202 r6 = g3 g2 g1 g0
203 r7 = h3 h2 h1 h0
204
205 At exit, we have:
206
207 I(0) = d0 c0 b0 a0
208 I(1) = d1 c1 b1 a1
209 I(2) = d2 c2 b2 a2
210 I(3) = d3 c3 b3 a3
211
212 J(4) = h0 g0 f0 e0
213 J(5) = h1 g1 f1 e1
214 J(6) = h2 g2 f2 e2
215 J(7) = h3 g3 f3 e3
216
217 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
218 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
219
220 Since r1 is free at entry, we calculate the Js first. */
221
222 #define Transpose() { \
223 movq_r2r(r4, r1); /* r1 = e3 e2 e1 e0 */ \
224 punpcklwd_r2r(r5, r4); /* r4 = f1 e1 f0 e0 */ \
225 movq_r2m(r0, *I(0)); /* save a3 a2 a1 a0 */ \
226 punpckhwd_r2r(r5, r1); /* r1 = f3 e3 f2 e2 */ \
227 movq_r2r(r6, r0); /* r0 = g3 g2 g1 g0 */ \
228 punpcklwd_r2r(r7, r6); /* r6 = h1 g1 h0 g0 */ \
229 movq_r2r(r4, r5); /* r5 = f1 e1 f0 e0 */ \
230 punpckldq_r2r(r6, r4); /* r4 = h0 g0 f0 e0 = R4 */ \
231 punpckhdq_r2r(r6, r5); /* r5 = h1 g1 f1 e1 = R5 */ \
232 movq_r2r(r1, r6); /* r6 = f3 e3 f2 e2 */ \
233 movq_r2m(r4, *J(4)); \
234 punpckhwd_r2r(r7, r0); /* r0 = h3 g3 h2 g2 */ \
235 movq_r2m(r5, *J(5)); \
236 punpckhdq_r2r(r0, r6); /* r6 = h3 g3 f3 e3 = R7 */ \
237 movq_m2r(*I(0), r4); /* r4 = a3 a2 a1 a0 */ \
238 punpckldq_r2r(r0, r1); /* r1 = h2 g2 f2 e2 = R6 */ \
239 movq_m2r(*I(1), r5); /* r5 = b3 b2 b1 b0 */ \
240 movq_r2r(r4, r0); /* r0 = a3 a2 a1 a0 */ \
241 movq_r2m(r6, *J(7)); \
242 punpcklwd_r2r(r5, r0); /* r0 = b1 a1 b0 a0 */ \
243 movq_r2m(r1, *J(6)); \
244 punpckhwd_r2r(r5, r4); /* r4 = b3 a3 b2 a2 */ \
245 movq_r2r(r2, r5); /* r5 = c3 c2 c1 c0 */ \
246 punpcklwd_r2r(r3, r2); /* r2 = d1 c1 d0 c0 */ \
247 movq_r2r(r0, r1); /* r1 = b1 a1 b0 a0 */ \
248 punpckldq_r2r(r2, r0); /* r0 = d0 c0 b0 a0 = R0 */ \
249 punpckhdq_r2r(r2, r1); /* r1 = d1 c1 b1 a1 = R1 */ \
250 movq_r2r(r4, r2); /* r2 = b3 a3 b2 a2 */ \
251 movq_r2m(r0, *I(0)); \
252 punpckhwd_r2r(r3, r5); /* r5 = d3 c3 d2 c2 */ \
253 movq_r2m(r1, *I(1)); \
254 punpckhdq_r2r(r5, r4); /* r4 = d3 c3 b3 a3 = R3 */ \
255 punpckldq_r2r(r5, r2); /* r2 = d2 c2 b2 a2 = R2 */ \
256 movq_r2m(r4, *I(3)); \
257 movq_r2m(r2, *I(2)); \
258 }
259
260 void vp3_dsp_init_mmx(void)
261 {
262 int j = 16;
263 uint16_t *p;
264
265 do {
266 idct_constants[--j] = 0;
267 } while (j);
268
269 idct_constants[0] = idct_constants[5] =
270 idct_constants[10] = idct_constants[15] = 65535;
271
272 j = 1;
273 do {
274 p = idct_constants + ((j + 3) << 2);
275 p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1];
276 } while (++j <= 7);
277
278 idct_constants[44] = idct_constants[45] =
279 idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
280 }
281
282 static void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix,
283 int16_t *output_data)
284 {
285 /* eax = quantized input
286 * ebx = dequantizer matrix
287 * ecx = IDCT constants
288 * M(I) = ecx + MaskOffset(0) + I * 8
289 * C(I) = ecx + CosineOffset(32) + (I-1) * 8
290 * edx = output
291 * r0..r7 = mm0..mm7
292 */
293
294 #define M(x) (idct_constants + x * 4)
295 #define C(x) (idct_constants + 16 + (x - 1) * 4)
296 #define Eight (idct_constants + 44)
297
298 unsigned char *input_bytes = (unsigned char *)input_data;
299 unsigned char *dequant_matrix_bytes = (unsigned char *)dequant_matrix;
300 unsigned char *output_data_bytes = (unsigned char *)output_data;
301
302 movq_m2r(*(input_bytes), r0);
303 pmullw_m2r(*(dequant_matrix_bytes), r0); /* r0 = 03 02 01 00 */
304 movq_m2r(*(input_bytes+16), r1);
305 pmullw_m2r(*(dequant_matrix_bytes+16), r1); /* r1 = 13 12 11 10 */
306 movq_m2r(*M(0), r2); /* r2 = __ __ __ FF */
307 movq_r2r(r0, r3); /* r3 = 03 02 01 00 */
308 movq_m2r(*(input_bytes+8), r4);
309 psrlq_i2r(16, r0); /* r0 = __ 03 02 01 */
310 pmullw_m2r(*(dequant_matrix_bytes+8), r4); /* r4 = 07 06 05 04 */
311 pand_r2r(r2, r3); /* r3 = __ __ __ 00 */
312 movq_r2r(r0, r5); /* r5 = __ 03 02 01 */
313 movq_r2r(r1, r6); /* r6 = 13 12 11 10 */
314 pand_r2r(r2, r5); /* r5 = __ __ __ 01 */
315 psllq_i2r(32, r6); /* r6 = 11 10 __ __ */
316 movq_m2r(*M(3), r7); /* r7 = FF __ __ __ */
317 pxor_r2r(r5, r0); /* r0 = __ 03 02 __ */
318 pand_r2r(r6, r7); /* r7 = 11 __ __ __ */
319 por_r2r(r3, r0); /* r0 = __ 03 02 00 */
320 pxor_r2r(r7, r6); /* r6 = __ 10 __ __ */
321 por_r2r(r7, r0); /* r0 = 11 03 02 00 = R0 */
322 movq_m2r(*M(3), r7); /* r7 = FF __ __ __ */
323 movq_r2r(r4, r3); /* r3 = 07 06 05 04 */
324 movq_r2m(r0, *(output_data_bytes)); /* write R0 = r0 */
325 pand_r2r(r2, r3); /* r3 = __ __ __ 04 */
326 movq_m2r(*(input_bytes+32), r0);
327 psllq_i2r(16, r3); /* r3 = __ __ 04 __ */
328 pmullw_m2r(*(dequant_matrix_bytes+32), r0); /* r0 = 23 22 21 20 */
329 pand_r2r(r1, r7); /* r7 = 13 __ __ __ */
330 por_r2r(r3, r5); /* r5 = __ __ 04 01 */
331 por_r2r(r6, r7); /* r7 = 13 10 __ __ */
332 movq_m2r(*(input_bytes+24), r3);
333 por_r2r(r5, r7); /* r7 = 13 10 04 01 = R1 */
334 pmullw_m2r(*(dequant_matrix_bytes+24), r3); /* r3 = 17 16 15 14 */
335 psrlq_i2r(16, r4); /* r4 = __ 07 06 05 */
336 movq_r2m(r7, *(output_data_bytes+16)); /* write R1 = r7 */
337 movq_r2r(r4, r5); /* r5 = __ 07 06 05 */
338 movq_r2r(r0, r7); /* r7 = 23 22 21 20 */
339 psrlq_i2r(16, r4); /* r4 = __ __ 07 06 */
340 psrlq_i2r(48, r7); /* r7 = __ __ __ 23 */
341 movq_r2r(r2, r6); /* r6 = __ __ __ FF */
342 pand_r2r(r2, r5); /* r5 = __ __ __ 05 */
343 pand_r2r(r4, r6); /* r6 = __ __ __ 06 */
344 movq_r2m(r7, *(output_data_bytes+80)); /* partial R9 = __ __ __ 23 */
345 pxor_r2r(r6, r4); /* r4 = __ __ 07 __ */
346 psrlq_i2r(32, r1); /* r1 = __ __ 13 12 */
347 por_r2r(r5, r4); /* r4 = __ __ 07 05 */
348 movq_m2r(*M(3), r7); /* r7 = FF __ __ __ */
349 pand_r2r(r2, r1); /* r1 = __ __ __ 12 */
350 movq_m2r(*(input_bytes+48), r5);
351 psllq_i2r(16, r0); /* r0 = 22 21 20 __ */
352 pmullw_m2r(*(dequant_matrix_bytes+48), r5); /* r5 = 33 32 31 30 */
353 pand_r2r(r0, r7); /* r7 = 22 __ __ __ */
354 movq_r2m(r1, *(output_data_bytes+64)); /* partial R8 = __ __ __ 12 */
355 por_r2r(r4, r7); /* r7 = 22 __ 07 05 */
356 movq_r2r(r3, r4); /* r4 = 17 16 15 14 */
357 pand_r2r(r2, r3); /* r3 = __ __ __ 14 */
358 movq_m2r(*M(2), r1); /* r1 = __ FF __ __ */
359 psllq_i2r(32, r3); /* r3 = __ 14 __ __ */
360 por_r2r(r3, r7); /* r7 = 22 14 07 05 = R2 */
361 movq_r2r(r5, r3); /* r3 = 33 32 31 30 */
362 psllq_i2r(48, r3); /* r3 = 30 __ __ __ */
363 pand_r2r(r0, r1); /* r1 = __ 21 __ __ */
364 movq_r2m(r7, *(output_data_bytes+32)); /* write R2 = r7 */
365 por_r2r(r3, r6); /* r6 = 30 __ __ 06 */
366 movq_m2r(*M(1), r7); /* r7 = __ __ FF __ */
367 por_r2r(r1, r6); /* r6 = 30 21 __ 06 */
368 movq_m2r(*(input_bytes+56), r1);
369 pand_r2r(r4, r7); /* r7 = __ __ 15 __ */
370 pmullw_m2r(*(dequant_matrix_bytes+56), r1); /* r1 = 37 36 35 34 */
371 por_r2r(r6, r7); /* r7 = 30 21 15 06 = R3 */
372 pand_m2r(*M(1), r0); /* r0 = __ __ 20 __ */
373 psrlq_i2r(32, r4); /* r4 = __ __ 17 16 */
374 movq_r2m(r7, *(output_data_bytes+48)); /* write R3 = r7 */
375 movq_r2r(r4, r6); /* r6 = __ __ 17 16 */
376 movq_m2r(*M(3), r7); /* r7 = FF __ __ __ */
377 pand_r2r(r2, r4); /* r4 = __ __ __ 16 */
378 movq_m2r(*M(1), r3); /* r3 = __ __ FF __ */
379 pand_r2r(r1, r7); /* r7 = 37 __ __ __ */
380 pand_r2r(r5, r3); /* r3 = __ __ 31 __ */
381 por_r2r(r4, r0); /* r0 = __ __ 20 16 */
382 psllq_i2r(16, r3); /* r3 = __ 31 __ __ */
383 por_r2r(r0, r7); /* r7 = 37 __ 20 16 */
384 movq_m2r(*M(2), r4); /* r4 = __ FF __ __ */
385 por_r2r(r3, r7); /* r7 = 37 31 20 16 = R4 */
386 movq_m2r(*(input_bytes+80), r0);
387 movq_r2r(r4, r3); /* r3 = __ __ FF __ */
388 pmullw_m2r(*(dequant_matrix_bytes+80), r0); /* r0 = 53 52 51 50 */
389 pand_r2r(r5, r4); /* r4 = __ 32 __ __ */
390 movq_r2m(r7, *(output_data_bytes+8)); /* write R4 = r7 */
391 por_r2r(r4, r6); /* r6 = __ 32 17 16 */
392 movq_r2r(r3, r4); /* r4 = __ FF __ __ */
393 psrlq_i2r(16, r6); /* r6 = __ __ 32 17 */
394 movq_r2r(r0, r7); /* r7 = 53 52 51 50 */
395 pand_r2r(r1, r4); /* r4 = __ 36 __ __ */
396 psllq_i2r(48, r7); /* r7 = 50 __ __ __ */
397 por_r2r(r4, r6); /* r6 = __ 36 32 17 */
398 movq_m2r(*(input_bytes+88), r4);
399 por_r2r(r6, r7); /* r7 = 50 36 32 17 = R5 */
400 pmullw_m2r(*(dequant_matrix_bytes+88), r4); /* r4 = 57 56 55 54 */
401 psrlq_i2r(16, r3); /* r3 = __ __ FF __ */
402 movq_r2m(r7, *(output_data_bytes+24)); /* write R5 = r7 */
403 pand_r2r(r1, r3); /* r3 = __ __ 35 __ */
404 psrlq_i2r(48, r5); /* r5 = __ __ __ 33 */
405 pand_r2r(r2, r1); /* r1 = __ __ __ 34 */
406 movq_m2r(*(input_bytes+104), r6);
407 por_r2r(r3, r5); /* r5 = __ __ 35 33 */
408 pmullw_m2r(*(dequant_matrix_bytes+104), r6); /* r6 = 67 66 65 64 */
409 psrlq_i2r(16, r0); /* r0 = __ 53 52 51 */
410 movq_r2r(r4, r7); /* r7 = 57 56 55 54 */
411 movq_r2r(r2, r3); /* r3 = __ __ __ FF */
412 psllq_i2r(48, r7); /* r7 = 54 __ __ __ */
413 pand_r2r(r0, r3); /* r3 = __ __ __ 51 */
414 pxor_r2r(r3, r0); /* r0 = __ 53 52 __ */
415 psllq_i2r(32, r3); /* r3 = __ 51 __ __ */
416 por_r2r(r5, r7); /* r7 = 54 __ 35 33 */
417 movq_r2r(r6, r5); /* r5 = 67 66 65 64 */
418 pand_m2r(*M(1), r6); /* r6 = __ __ 65 __ */
419 por_r2r(r3, r7); /* r7 = 54 51 35 33 = R6 */
420 psllq_i2r(32, r6); /* r6 = 65 __ __ __ */
421 por_r2r(r1, r0); /* r0 = __ 53 52 34 */
422 movq_r2m(r7, *(output_data_bytes+40)); /* write R6 = r7 */
423 por_r2r(r6, r0); /* r0 = 65 53 52 34 = R7 */
424 movq_m2r(*(input_bytes+120), r7);
425 movq_r2r(r5, r6); /* r6 = 67 66 65 64 */
426 pmullw_m2r(*(dequant_matrix_bytes+120), r7); /* r7 = 77 76 75 74 */
427 psrlq_i2r(32, r5); /* r5 = __ __ 67 66 */
428 pand_r2r(r2, r6); /* r6 = __ __ __ 64 */
429 movq_r2r(r5, r1); /* r1 = __ __ 67 66 */
430 movq_r2m(r0, *(output_data_bytes+56)); /* write R7 = r0 */
431 pand_r2r(r2, r1); /* r1 = __ __ __ 66 */
432 movq_m2r(*(input_bytes+112), r0);
433 movq_r2r(r7, r3); /* r3 = 77 76 75 74 */
434 pmullw_m2r(*(dequant_matrix_bytes+112), r0); /* r0 = 73 72 71 70 */
435 psllq_i2r(16, r3); /* r3 = 76 75 74 __ */
436 pand_m2r(*M(3), r7); /* r7 = 77 __ __ __ */
437 pxor_r2r(r1, r5); /* r5 = __ __ 67 __ */
438 por_r2r(r5, r6); /* r6 = __ __ 67 64 */
439 movq_r2r(r3, r5); /* r5 = 76 75 74 __ */
440 pand_m2r(*M(3), r5); /* r5 = 76 __ __ __ */
441 por_r2r(r1, r7); /* r7 = 77 __ __ 66 */
442 movq_m2r(*(input_bytes+96), r1);
443 pxor_r2r(r5, r3); /* r3 = __ 75 74 __ */
444 pmullw_m2r(*(dequant_matrix_bytes+96), r1); /* r1 = 63 62 61 60 */
445 por_r2r(r3, r7); /* r7 = 77 75 74 66 = R15 */
446 por_r2r(r5, r6); /* r6 = 76 __ 67 64 */
447 movq_r2r(r0, r5); /* r5 = 73 72 71 70 */
448 movq_r2m(r7, *(output_data_bytes+120)); /* store R15 = r7 */
449 psrlq_i2r(16, r5); /* r5 = __ 73 72 71 */
450 pand_m2r(*M(2), r5); /* r5 = __ 73 __ __ */
451 movq_r2r(r0, r7); /* r7 = 73 72 71 70 */
452 por_r2r(r5, r6); /* r6 = 76 73 67 64 = R14 */
453 pand_r2r(r2, r0); /* r0 = __ __ __ 70 */
454 pxor_r2r(r0, r7); /* r7 = 73 72 71 __ */
455 psllq_i2r(32, r0); /* r0 = __ 70 __ __ */
456 movq_r2m(r6, *(output_data_bytes+104)); /* write R14 = r6 */
457 psrlq_i2r(16, r4); /* r4 = __ 57 56 55 */
458 movq_m2r(*(input_bytes+72), r5);
459 psllq_i2r(16, r7); /* r7 = 72 71 __ __ */
460 pmullw_m2r(*(dequant_matrix_bytes+72), r5); /* r5 = 47 46 45 44 */
461 movq_r2r(r7, r6); /* r6 = 72 71 __ __ */
462 movq_m2r(*M(2), r3); /* r3 = __ FF __ __ */
463 psllq_i2r(16, r6); /* r6 = 71 __ __ __ */
464 pand_m2r(*M(3), r7); /* r7 = 72 __ __ __ */
465 pand_r2r(r1, r3); /* r3 = __ 62 __ __ */
466 por_r2r(r0, r7); /* r7 = 72 70 __ __ */
467 movq_r2r(r1, r0); /* r0 = 63 62 61 60 */
468 pand_m2r(*M(3), r1); /* r1 = 63 __ __ __ */
469 por_r2r(r3, r6); /* r6 = 71 62 __ __ */
470 movq_r2r(r4, r3); /* r3 = __ 57 56 55 */
471 psrlq_i2r(32, r1); /* r1 = __ __ 63 __ */
472 pand_r2r(r2, r3); /* r3 = __ __ __ 55 */
473 por_r2r(r1, r7); /* r7 = 72 70 63 __ */
474 por_r2r(r3, r7); /* r7 = 72 70 63 55 = R13 */
475 movq_r2r(r4, r3); /* r3 = __ 57 56 55 */
476 pand_m2r(*M(1), r3); /* r3 = __ __ 56 __ */
477 movq_r2r(r5, r1); /* r1 = 47 46 45 44 */
478 movq_r2m(r7, *(output_data_bytes+88)); /* write R13 = r7 */
479 psrlq_i2r(48, r5); /* r5 = __ __ __ 47 */
480 movq_m2r(*(input_bytes+64), r7);
481 por_r2r(r3, r6); /* r6 = 71 62 56 __ */
482 pmullw_m2r(*(dequant_matrix_bytes+64), r7); /* r7 = 43 42 41 40 */
483 por_r2r(r5, r6); /* r6 = 71 62 56 47 = R12 */
484 pand_m2r(*M(2), r4); /* r4 = __ 57 __ __ */
485 psllq_i2r(32, r0); /* r0 = 61 60 __ __ */
486 movq_r2m(r6, *(output_data_bytes+72)); /* write R12 = r6 */
487 movq_r2r(r0, r6); /* r6 = 61 60 __ __ */
488 pand_m2r(*M(3), r0); /* r0 = 61 __ __ __ */
489 psllq_i2r(16, r6); /* r6 = 60 __ __ __ */
490 movq_m2r(*(input_bytes+40), r5);
491 movq_r2r(r1, r3); /* r3 = 47 46 45 44 */
492 pmullw_m2r(*(dequant_matrix_bytes+40), r5); /* r5 = 27 26 25 24 */
493 psrlq_i2r(16, r1); /* r1 = __ 47 46 45 */
494 pand_m2r(*M(1), r1); /* r1 = __ __ 46 __ */
495 por_r2r(r4, r0); /* r0 = 61 57 __ __ */
496 pand_r2r(r7, r2); /* r2 = __ __ __ 40 */
497 por_r2r(r1, r0); /* r0 = 61 57 46 __ */
498 por_r2r(r2, r0); /* r0 = 61 57 46 40 = R11 */
499 psllq_i2r(16, r3); /* r3 = 46 45 44 __ */
500 movq_r2r(r3, r4); /* r4 = 46 45 44 __ */
501 movq_r2r(r5, r2); /* r2 = 27 26 25 24 */
502 movq_r2m(r0, *(output_data_bytes+112)); /* write R11 = r0 */
503 psrlq_i2r(48, r2); /* r2 = __ __ __ 27 */
504 pand_m2r(*M(2), r4); /* r4 = __ 45 __ __ */
505 por_r2r(r2, r6); /* r6 = 60 __ __ 27 */
506 movq_m2r(*M(1), r2); /* r2 = __ __ FF __ */
507 por_r2r(r4, r6); /* r6 = 60 45 __ 27 */
508 pand_r2r(r7, r2); /* r2 = __ __ 41 __ */
509 psllq_i2r(32, r3); /* r3 = 44 __ __ __ */
510 por_m2r(*(output_data_bytes+80), r3); /* r3 = 44 __ __ 23 */
511 por_r2r(r2, r6); /* r6 = 60 45 41 27 = R10 */
512 movq_m2r(*M(3), r2); /* r2 = FF __ __ __ */
513 psllq_i2r(16, r5); /* r5 = 26 25 24 __ */
514 movq_r2m(r6, *(output_data_bytes+96)); /* store R10 = r6 */
515 pand_r2r(r5, r2); /* r2 = 26 __ __ __ */
516 movq_m2r(*M(2), r6); /* r6 = __ FF __ __ */
517 pxor_r2r(r2, r5); /* r5 = __ 25 24 __ */
518 pand_r2r(r7, r6); /* r6 = __ 42 __ __ */
519 psrlq_i2r(32, r2); /* r2 = __ __ 26 __ */
520 pand_m2r(*M(3), r7); /* r7 = 43 __ __ __ */
521 por_r2r(r2, r3); /* r3 = 44 __ 26 23 */
522 por_m2r(*(output_data_bytes+64), r7); /* r7 = 43 __ __ 12 */
523 por_r2r(r3, r6); /* r6 = 44 42 26 23 = R9 */
524 por_r2r(r5, r7); /* r7 = 43 25 24 12 = R8 */
525 movq_r2m(r6, *(output_data_bytes+80)); /* store R9 = r6 */
526 movq_r2m(r7, *(output_data_bytes+64)); /* store R8 = r7 */
527
528
529 #undef M
530
531 /* at this point, function has completed dequantization + dezigzag +
532 * partial transposition; now do the idct itself */
533
534 #define I(K) (output_data + K * 8)
535 #define J(K) (output_data + ((K - 4) * 8) + 4)
536
537 RowIDCT();
538 Transpose();
539
540 #undef I
541 #undef J
542 #define I(K) (output_data + (K * 8) + 32)
543 #define J(K) (output_data + ((K - 4) * 8) + 36)
544
545 RowIDCT();
546 Transpose();
547
548 #undef I
549 #undef J
550 #define I(K) (output_data + K * 8)
551 #define J(K) (output_data + K * 8)
552
553 ColumnIDCT();
554
555 #undef I
556 #undef J
557 #define I(K) (output_data + (K * 8) + 4)
558 #define J(K) (output_data + (K * 8) + 4)
559
560 ColumnIDCT();
561
562 #undef I
563 #undef J
564
565 }
566
567 void vp3_idct_put_mmx(int16_t *input_data, int16_t *dequant_matrix,
568 int coeff_count, uint8_t *dest, int stride)
569 {
570 int16_t transformed_data[64];
571 int16_t *op;
572 int i, j;
573 uint8_t vector128[8] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
574
575 vp3_idct_mmx(input_data, dequant_matrix, transformed_data);
576
577 /* place in final output */
578 op = transformed_data;
579 movq_m2r(*vector128, mm0);
580 for (i = 0; i < 8; i++) {
581 #if 1
582 for (j = 0; j < 8; j++) {
583 if (*op < -128)
584 *dest = 0;
585 else if (*op > 127)
586 *dest = 255;
587 else
588 *dest = (uint8_t)(*op + 128);
589 op++;
590 dest++;
591 }
592 dest += (stride - 8);
593 #else
594 /* prototype optimization */
595 pxor_r2r(mm1, mm1);
596 packsswb_m2r(*(op + 4), mm1);
597 movq_r2r(mm1, mm2);
598 psrlq_i2r(32, mm2);
599 packsswb_m2r(*(op + 0), mm1);
600 op += 8;
601 por_r2r(mm2, mm1);
602 paddb_r2r(mm0, mm1);
603 movq_r2m(mm1, *dest);
604 dest += stride;
605 #endif
606 }
607
608 /* be a good MMX citizen */
609 emms();
610 }
611
612 void vp3_idct_add_mmx(int16_t *input_data, int16_t *dequant_matrix,
613 int coeff_count, uint8_t *dest, int stride)
614 {
615 int16_t transformed_data[64];
616 int16_t *op;
617 int i, j;
618 int16_t sample;
619
620 vp3_idct_mmx(input_data, dequant_matrix, transformed_data);
621
622 /* place in final output */
623 op = transformed_data;
624 for (i = 0; i < 8; i++) {
625 for (j = 0; j < 8; j++) {
626 sample = *dest + *op;
627 if (sample < 0)
628 *dest = 0;
629 else if (sample > 255)
630 *dest = 255;
631 else
632 *dest = (uint8_t)(sample & 0xFF);
633 op++;
634 dest++;
635 }
636 dest += (stride - 8);
637 }
638
639 /* be a good MMX citizen */
640 emms();
641 }