2 * Copyright (C) 2004 the ffmpeg project
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 * MMX-optimized functions cribbed from the original VP3 source code.
24 #include "../dsputil.h"
27 #define IdctAdjustBeforeShift 8
29 /* (12 * 4) 2-byte memory locations ( = 96 bytes total)
30 * idct_constants[0..15] = Mask table (M(I))
31 * idct_constants[16..43] = Cosine table (C(I))
32 * idct_constants[44..47] = 8
34 static uint16_t idct_constants
[(4 + 7 + 1) * 4];
35 static uint16_t idct_cosine_table
[7] = {
36 64277, 60547, 54491, 46341, 36410, 25080, 12785
48 /* from original comments: The Macro does IDct on 4 1-D Dcts */
50 movq_m2r(*I(3), r2); \
51 movq_m2r(*C(3), r6); \
53 movq_m2r(*J(5), r7); \
55 movq_m2r(*C(5), r1); \
59 movq_m2r(*I(1), r3); \
61 movq_m2r(*C(1), r0); \
65 movq_m2r(*J(7), r1); \
71 movq_m2r(*C(7), r7); \
75 movq_m2r(*I(2), r2); \
79 pmulhw_m2r(*C(2), r2); \
81 movq_m2r(*J(6), r5); \
85 pmulhw_m2r(*C(2), r5); \
87 pmulhw_m2r(*C(6), r1); \
93 pmulhw_m2r(*C(6), r7); \
95 movq_r2m(r4, *I(1)); \
97 movq_m2r(*C(4), r4); \
100 paddsw_r2r(r2, r7); \
101 movq_r2m(r6, *I(2)); \
103 movq_m2r(*I(0), r6); \
104 pmulhw_r2r(r4, r0); \
106 movq_m2r(*J(4), r3); \
107 psubsw_r2r(r1, r5); \
109 psubsw_r2r(r3, r6); \
111 pmulhw_r2r(r4, r6); \
112 paddsw_r2r(r3, r3); \
113 paddsw_r2r(r1, r1); \
114 paddsw_r2r(r0, r3); \
115 paddsw_r2r(r5, r1); \
116 pmulhw_r2r(r3, r4); \
117 paddsw_r2r(r0, r6); \
118 psubsw_r2r(r2, r6); \
119 paddsw_r2r(r2, r2); \
120 movq_m2r(*I(1), r0); \
121 paddsw_r2r(r6, r2); \
125 /* RowIDCT gets ready to transpose */
130 movq_m2r(*I(2), r3); \
131 psubsw_r2r(r7, r4); \
132 paddsw_r2r(r1, r1); \
133 paddsw_r2r(r7, r7); \
134 paddsw_r2r(r2, r1); \
135 paddsw_r2r(r4, r7); \
136 psubsw_r2r(r3, r4); \
137 psubsw_r2r(r5, r6); \
138 paddsw_r2r(r5, r5); \
139 paddsw_r2r(r4, r3); \
140 paddsw_r2r(r6, r5); \
141 psubsw_r2r(r0, r7); \
142 paddsw_r2r(r0, r0); \
143 movq_r2m(r1, *I(1)); \
146 /* Column IDCT normalizes and stores final results */
147 #define ColumnIDCT() \
151 paddsw_m2r(*Eight, r2); \
152 paddsw_r2r(r1, r1); \
153 paddsw_r2r(r2, r1); \
155 psubsw_r2r(r7, r4); \
157 movq_m2r(*I(2), r3); \
158 paddsw_r2r(r7, r7); \
159 movq_r2m(r2, *I(2)); \
160 paddsw_r2r(r4, r7); \
161 movq_r2m(r1, *I(1)); \
162 psubsw_r2r(r3, r4); \
163 paddsw_m2r(*Eight, r4); \
164 paddsw_r2r(r3, r3); \
165 paddsw_r2r(r4, r3); \
167 psubsw_r2r(r5, r6); \
169 paddsw_m2r(*Eight, r6); \
170 paddsw_r2r(r5, r5); \
171 paddsw_r2r(r6, r5); \
173 movq_r2m(r4, *J(4)); \
175 movq_r2m(r3, *I(3)); \
176 psubsw_r2r(r0, r7); \
177 paddsw_m2r(*Eight, r7); \
178 paddsw_r2r(r0, r0); \
179 paddsw_r2r(r7, r0); \
181 movq_r2m(r6, *J(6)); \
183 movq_r2m(r5, *J(5)); \
184 movq_r2m(r7, *J(7)); \
188 /* Following macro does two 4x4 transposes in place.
190 At entry (we assume):
214 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
215 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
217 Since r1 is free at entry, we calculate the Js first. */
219 #define Transpose() \
221 punpcklwd_r2r(r5, r4); \
222 movq_r2m(r0, *I(0)); \
223 punpckhwd_r2r(r5, r1); \
225 punpcklwd_r2r(r7, r6); \
227 punpckldq_r2r(r6, r4); \
228 punpckhdq_r2r(r6, r5); \
230 movq_r2m(r4, *J(4)); \
231 punpckhwd_r2r(r7, r0); \
232 movq_r2m(r5, *J(5)); \
233 punpckhdq_r2r(r0, r6); \
234 movq_m2r(*I(0), r4); \
235 punpckldq_r2r(r0, r1); \
236 movq_m2r(*I(1), r5); \
238 movq_r2m(r6, *J(7)); \
239 punpcklwd_r2r(r5, r0); \
240 movq_r2m(r1, *J(6)); \
241 punpckhwd_r2r(r5, r4); \
243 punpcklwd_r2r(r3, r2); \
245 punpckldq_r2r(r2, r0); \
246 punpckhdq_r2r(r2, r1); \
248 movq_r2m(r0, *I(0)); \
249 punpckhwd_r2r(r3, r5); \
250 movq_r2m(r1, *I(1)); \
251 punpckhdq_r2r(r5, r4); \
252 punpckldq_r2r(r5, r2); \
253 movq_r2m(r4, *I(3)); \
257 void vp3_dsp_init_mmx(void)
263 idct_constants
[--j
] = 0;
266 idct_constants
[0] = idct_constants
[5] =
267 idct_constants
[10] = idct_constants
[15] = 65535;
271 p
= idct_constants
+ ((j
+ 3) << 2);
272 p
[0] = p
[1] = p
[2] = p
[3] = idct_cosine_table
[j
- 1];
275 idct_constants
[44] = idct_constants
[45] =
276 idct_constants
[46] = idct_constants
[47] = IdctAdjustBeforeShift
;
279 static void vp3_idct_mmx(int16_t *input_data
, int16_t *dequant_matrix
,
280 int16_t *output_data
)
282 /* eax = quantized input
283 * ebx = dequantizer matrix
284 * ecx = IDCT constants
285 * M(I) = ecx + MaskOffset(0) + I * 8
286 * C(I) = ecx + CosineOffset(32) + (I-1) * 8
291 #define M(x) (idct_constants + x * 4)
292 #define C(x) (idct_constants + 16 + (x - 1) * 4)
293 #define Eight (idct_constants + 44)
295 movq_m2r(*input_data
, r0
);
296 pmullw_m2r(*dequant_matrix
, r0
);
297 movq_m2r(*(input_data
+ 8), r1
);
298 pmullw_m2r(*(dequant_matrix
+ 8), r1
);
301 movq_m2r(*(input_data
+ 4), r4
);
303 pmullw_m2r(*(dequant_matrix
+ 4), r4
);
317 movq_r2m(r0
, *output_data
);
320 movq_m2r(*(input_data
+ 16), r0
);
322 pmullw_m2r(*(dequant_matrix
+ 16), r0
);
326 movq_m2r(*(input_data
+ 12), r3
);
328 pmullw_m2r(*(dequant_matrix
+ 12), r3
);
330 movq_r2m(r7
, *(output_data
+ 8));
339 movq_r2m(r7
, *(output_data
+ 40));
346 movq_m2r(*(input_data
+ 24), r5
);
348 pmullw_m2r(*(dequant_matrix
+ 24), r5
);
350 movq_r2m(r1
, *(output_data
+ 32));
361 movq_r2m(r7
, *(output_data
+ 16));
366 movq_m2r(*(input_data
+ 28), r1
);
368 pmullw_m2r(*(dequant_matrix
+ 28), r1
);
372 movq_r2m(r7
, *(output_data
+ 24));
385 movq_m2r(*(input_data
+ 40), r0
);
387 pmullw_m2r(*(dequant_matrix
+ 40), r0
);
389 movq_r2m(r7
, *(output_data
+ 4));
398 movq_m2r(*(input_data
+ 44), r4
);
400 pmullw_m2r(*(dequant_matrix
+ 44), r4
);
402 movq_r2m(r7
, *(output_data
+ 12));
407 movq_m2r(*(input_data
+ 52), r6
);
409 pmullw_m2r(*(input_data
+ 52), r6
);
423 movq_r2m(r7
, *(output_data
+ 20));
426 movq_m2r(*(input_data
+ 60), r7
);
428 pmullw_m2r(*(input_data
+ 60), r7
);
432 movq_r2m(r0
, *(output_data
+ 28));
435 movq_m2r(*(input_data
+ 56), r0
);
437 pmullw_m2r(*(dequant_matrix
+ 56), r0
);
445 movq_m2r(*(input_data
+ 48), r1
);
447 pmullw_m2r(*(dequant_matrix
+ 48), r1
);
451 movq_r2m(r7
, *(output_data
+ 60));
460 movq_r2m(r6
, *(output_data
+ 52));
463 movq_m2r(*(input_data
+ 36), r5
);
465 pmullw_m2r(*(dequant_matrix
+ 36), r5
);
483 movq_r2m(r7
, *(output_data
+ 44));
486 movq_m2r(*(input_data
+ 32), r7
);
488 pmullw_m2r(*(dequant_matrix
+ 32), r7
);
492 movq_r2m(r6
, *(output_data
+ 36));
497 movq_m2r(*(input_data
+ 20), r5
);
499 pmullw_m2r(*(dequant_matrix
+ 40), r5
);
509 movq_r2m(r0
, *(output_data
+ 56));
518 por_m2r(*(output_data
+ 40), r3
);
523 movq_r2m(r6
, *(output_data
+ 48));
532 por_m2r(*(output_data
+ 32), r7
);
536 movq_r2m(r6
, *(output_data
+ 40));
537 movq_r2m(r7
, *(output_data
+ 32));
542 /* at this point, function has completed dequantization + dezigzag +
543 * partial transposition; now do the idct itself */
545 #define I(K) (output_data + K * 8)
546 #define J(K) (output_data + ((K - 4) * 8) + 4)
553 #define I(K) (output_data + (K * 8) + 32)
554 #define J(K) (output_data + ((K - 4) * 8) + 36)
561 #define I(K) (output_data + K * 8)
562 #define J(K) (output_data + K * 8)
568 #define I(K) (output_data + (K * 8) + 4)
569 #define J(K) (output_data + (K * 8) + 4)
578 void vp3_idct_put_mmx(int16_t *input_data
, int16_t *dequant_matrix
,
579 int coeff_count
, uint8_t *dest
, int stride
)
581 int16_t transformed_data
[64];
584 uint8_t vector128
[8] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
586 vp3_idct_mmx(input_data
, dequant_matrix
, transformed_data
);
588 /* place in final output */
589 op
= transformed_data
;
590 movq_m2r(*vector128
, mm0
);
591 for (i
= 0; i
< 8; i
++) {
593 for (j
= 0; j
< 8; j
++) {
599 *dest
= (uint8_t)(*op
+ 128);
603 dest
+= (stride
- 8);
605 /* prototype optimization */
607 packsswb_m2r(*(op
+ 4), mm1
);
610 packsswb_m2r(*(op
+ 0), mm1
);
614 movq_r2m(mm1
, *dest
);
619 /* be a good MMX citizen */
623 void vp3_idct_add_mmx(int16_t *input_data
, int16_t *dequant_matrix
,
624 int coeff_count
, uint8_t *dest
, int stride
)
626 int16_t transformed_data
[64];
631 vp3_idct_mmx(input_data
, dequant_matrix
, transformed_data
);
633 /* place in final output */
634 op
= transformed_data
;
635 for (i
= 0; i
< 8; i
++) {
636 for (j
= 0; j
< 8; j
++) {
637 sample
= *dest
+ *op
;
640 else if (sample
> 255)
643 *dest
= (uint8_t)(sample
& 0xFF);
647 dest
+= (stride
- 8);
650 /* be a good MMX citizen */