Initial revision
[libav.git] / libav / jrevdct.c
1 /*
2 * jrevdct.c
3 *
4 * Copyright (C) 1991, 1992, Thomas G. Lane.
5 * This file is part of the Independent JPEG Group's software.
6 * For conditions of distribution and use, see the accompanying README file.
7 *
8 * This file contains the basic inverse-DCT transformation subroutine.
9 *
10 * This implementation is based on an algorithm described in
11 * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
12 * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
13 * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
14 * The primary algorithm described there uses 11 multiplies and 29 adds.
15 * We use their alternate method with 12 multiplies and 32 adds.
16 * The advantage of this method is that no data path contains more than one
17 * multiplication; this allows a very simple and accurate implementation in
18 * scaled fixed-point arithmetic, with a minimal number of shifts.
19 *
20 * I've made lots of modifications to attempt to take advantage of the
21 * sparse nature of the DCT matrices we're getting. Although the logic
22 * is cumbersome, it's straightforward and the resulting code is much
23 * faster.
24 *
25 * A better way to do this would be to pass in the DCT block as a sparse
26 * matrix, perhaps with the difference cases encoded.
27 */
28
29 typedef int INT32;
30
31 /* Definition of Contant integer scale factor. */
32 #define CONST_BITS 13
33
34 /* Misc DCT definitions */
35 #define DCTSIZE 8 /* The basic DCT block is 8x8 samples */
36 #define DCTSIZE2 64 /* DCTSIZE squared; # of elements in a block */
37
38 #define GLOBAL /* a function referenced thru EXTERNs */
39
40 typedef int DCTELEM;
41 typedef DCTELEM DCTBLOCK[DCTSIZE2];
42
43 void j_rev_dct (DCTELEM *data);
44
45
46 #define GLOBAL /* a function referenced thru EXTERNs */
47 #define ORIG_DCT 1
48
49 /* We assume that right shift corresponds to signed division by 2 with
50 * rounding towards minus infinity. This is correct for typical "arithmetic
51 * shift" instructions that shift in copies of the sign bit. But some
52 * C compilers implement >> with an unsigned shift. For these machines you
53 * must define RIGHT_SHIFT_IS_UNSIGNED.
54 * RIGHT_SHIFT provides a proper signed right shift of an INT32 quantity.
55 * It is only applied with constant shift counts. SHIFT_TEMPS must be
56 * included in the variables of any routine using RIGHT_SHIFT.
57 */
58
59 #ifdef RIGHT_SHIFT_IS_UNSIGNED
60 #define SHIFT_TEMPS INT32 shift_temp;
61 #define RIGHT_SHIFT(x,shft) \
62 ((shift_temp = (x)) < 0 ? \
63 (shift_temp >> (shft)) | ((~((INT32) 0)) << (32-(shft))) : \
64 (shift_temp >> (shft)))
65 #else
66 #define SHIFT_TEMPS
67 #define RIGHT_SHIFT(x,shft) ((x) >> (shft))
68 #endif
69
70 /*
71 * This routine is specialized to the case DCTSIZE = 8.
72 */
73
74 #if DCTSIZE != 8
75 Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
76 #endif
77
78
79 /*
80 * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT
81 * on each column. Direct algorithms are also available, but they are
82 * much more complex and seem not to be any faster when reduced to code.
83 *
84 * The poop on this scaling stuff is as follows:
85 *
86 * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
87 * larger than the true IDCT outputs. The final outputs are therefore
88 * a factor of N larger than desired; since N=8 this can be cured by
89 * a simple right shift at the end of the algorithm. The advantage of
90 * this arrangement is that we save two multiplications per 1-D IDCT,
91 * because the y0 and y4 inputs need not be divided by sqrt(N).
92 *
93 * We have to do addition and subtraction of the integer inputs, which
94 * is no problem, and multiplication by fractional constants, which is
95 * a problem to do in integer arithmetic. We multiply all the constants
96 * by CONST_SCALE and convert them to integer constants (thus retaining
97 * CONST_BITS bits of precision in the constants). After doing a
98 * multiplication we have to divide the product by CONST_SCALE, with proper
99 * rounding, to produce the correct output. This division can be done
100 * cheaply as a right shift of CONST_BITS bits. We postpone shifting
101 * as long as possible so that partial sums can be added together with
102 * full fractional precision.
103 *
104 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
105 * they are represented to better-than-integral precision. These outputs
106 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
107 * with the recommended scaling. (To scale up 12-bit sample data further, an
108 * intermediate INT32 array would be needed.)
109 *
110 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
111 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
112 * shows that the values given below are the most effective.
113 */
114
115 #ifdef EIGHT_BIT_SAMPLES
116 #define PASS1_BITS 2
117 #else
118 #define PASS1_BITS 1 /* lose a little precision to avoid overflow */
119 #endif
120
121 #define ONE ((INT32) 1)
122
123 #define CONST_SCALE (ONE << CONST_BITS)
124
125 /* Convert a positive real constant to an integer scaled by CONST_SCALE.
126 * IMPORTANT: if your compiler doesn't do this arithmetic at compile time,
127 * you will pay a significant penalty in run time. In that case, figure
128 * the correct integer constant values and insert them by hand.
129 */
130
131 #define FIX(x) ((INT32) ((x) * CONST_SCALE + 0.5))
132
133 /* Descale and correctly round an INT32 value that's scaled by N bits.
134 * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
135 * the fudge factor is correct for either sign of X.
136 */
137
138 #define DESCALE(x,n) RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
139 #define SCALE(x,n) ((INT32)(x) << n)
140
141 /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
142 * For 8-bit samples with the recommended scaling, all the variable
143 * and constant values involved are no more than 16 bits wide, so a
144 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply;
145 * this provides a useful speedup on many machines.
146 * There is no way to specify a 16x16->32 multiply in portable C, but
147 * some C compilers will do the right thing if you provide the correct
148 * combination of casts.
149 * NB: for 12-bit samples, a full 32-bit multiplication will be needed.
150 */
151
152 #ifdef EIGHT_BIT_SAMPLES
153 #ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */
154 #define MULTIPLY(var,const) (((INT16) (var)) * ((INT16) (const)))
155 #endif
156 #ifdef SHORTxLCONST_32 /* known to work with Microsoft C 6.0 */
157 #define MULTIPLY(var,const) (((INT16) (var)) * ((INT32) (const)))
158 #endif
159 #endif
160
161 #if 0
162 /* force a multiplication for x86 where a multiply is fast). We
163 force the non constant operand to be in a register because
164 otherwise it may be a 16 bit memory reference, which is not allowed
165 by imull */
166 #define MULTIPLY(a,b) \
167 ({\
168 int res;\
169 asm("imull %2,%1,%0" : "=r" (res) : "r" ((int)(a)), "i" (b));\
170 res;\
171 })
172 #endif
173
174 #ifndef MULTIPLY /* default definition */
175 #define MULTIPLY(var,const) ((var) * (const))
176 #endif
177
178
179 #ifndef ORIG_DCT
180
181 #undef SSMUL
182 #define SSMUL(var1,var2) ((INT16)(var1) * (INT32)(INT16)(var2))
183
184 /* Precomputed idct value arrays. */
185
186 STATIC DCTELEM PreIDCT[64][64];
187
188 /* Pre compute singleton coefficient IDCT values. */
189 void init_pre_idct() {
190 int i;
191
192 for (i = 0; i < 64; i++) {
193 memset ((char *) PreIDCT[i], 0, 64 * sizeof(DCTELEM));
194 PreIDCT[i][i] = 2048;
195 j_rev_dct (PreIDCT[i]);
196 }
197 }
198
199 /*
200 * Perform the inverse DCT on one block of coefficients.
201 */
202
203 void j_rev_dct_sparse (data, pos)
204 DCTBLOCK data;
205 int pos;
206 {
207 register DCTELEM *dataptr;
208 short int val;
209 DCTELEM *ndataptr;
210 int coeff, rr;
211
212 /* If DC Coefficient. */
213
214 if (pos == 0) {
215 register INT32 *dp;
216 register INT32 v;
217
218 dp = (INT32*)data;
219 v = *data;
220 /* Compute 32 bit value to assign.
221 * This speeds things up a bit */
222 if (v < 0)
223 val = (short)((v - 3) >> 3);
224 else
225 val = (short)((v + 4) >> 3);
226 v = val | ((INT32)val << 16);
227 dp[0] = v; dp[1] = v; dp[2] = v; dp[3] = v;
228 dp[4] = v; dp[5] = v; dp[6] = v; dp[7] = v;
229 dp[8] = v; dp[9] = v; dp[10] = v; dp[11] = v;
230 dp[12] = v; dp[13] = v; dp[14] = v; dp[15] = v;
231 dp[16] = v; dp[17] = v; dp[18] = v; dp[19] = v;
232 dp[20] = v; dp[21] = v; dp[22] = v; dp[23] = v;
233 dp[24] = v; dp[25] = v; dp[26] = v; dp[27] = v;
234 dp[28] = v; dp[29] = v; dp[30] = v; dp[31] = v;
235 return;
236 }
237
238 /* Some other coefficient. */
239 dataptr = (DCTELEM *)data;
240 coeff = dataptr[pos];
241 ndataptr = PreIDCT[pos];
242
243 for (rr = 0; rr < 4; rr++) {
244 dataptr[0] = (DCTELEM)(SSMUL (ndataptr[0] , coeff) >> (CONST_BITS-2));
245 dataptr[1] = (DCTELEM)(SSMUL (ndataptr[1] , coeff) >> (CONST_BITS-2));
246 dataptr[2] = (DCTELEM)(SSMUL (ndataptr[2] , coeff) >> (CONST_BITS-2));
247 dataptr[3] = (DCTELEM)(SSMUL (ndataptr[3] , coeff) >> (CONST_BITS-2));
248 dataptr[4] = (DCTELEM)(SSMUL (ndataptr[4] , coeff) >> (CONST_BITS-2));
249 dataptr[5] = (DCTELEM)(SSMUL (ndataptr[5] , coeff) >> (CONST_BITS-2));
250 dataptr[6] = (DCTELEM)(SSMUL (ndataptr[6] , coeff) >> (CONST_BITS-2));
251 dataptr[7] = (DCTELEM)(SSMUL (ndataptr[7] , coeff) >> (CONST_BITS-2));
252 dataptr[8] = (DCTELEM)(SSMUL (ndataptr[8] , coeff) >> (CONST_BITS-2));
253 dataptr[9] = (DCTELEM)(SSMUL (ndataptr[9] , coeff) >> (CONST_BITS-2));
254 dataptr[10] = (DCTELEM)(SSMUL (ndataptr[10], coeff) >> (CONST_BITS-2));
255 dataptr[11] = (DCTELEM)(SSMUL (ndataptr[11], coeff) >> (CONST_BITS-2));
256 dataptr[12] = (DCTELEM)(SSMUL (ndataptr[12], coeff) >> (CONST_BITS-2));
257 dataptr[13] = (DCTELEM)(SSMUL (ndataptr[13], coeff) >> (CONST_BITS-2));
258 dataptr[14] = (DCTELEM)(SSMUL (ndataptr[14], coeff) >> (CONST_BITS-2));
259 dataptr[15] = (DCTELEM)(SSMUL (ndataptr[15], coeff) >> (CONST_BITS-2));
260 dataptr += 16;
261 ndataptr += 16;
262 }
263 }
264
265
266 void j_rev_dct (data)
267 DCTBLOCK data;
268 {
269 INT32 tmp0, tmp1, tmp2, tmp3;
270 INT32 tmp10, tmp11, tmp12, tmp13;
271 INT32 z1, z2, z3, z4, z5;
272 int d0, d1, d2, d3, d4, d5, d6, d7;
273 register DCTELEM *dataptr;
274 int rowctr;
275 SHIFT_TEMPS;
276
277 /* Pass 1: process rows. */
278 /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
279 /* furthermore, we scale the results by 2**PASS1_BITS. */
280
281 dataptr = data;
282
283 for (rowctr = DCTSIZE - 1; rowctr >= 0; rowctr--) {
284 /* Due to quantization, we will usually find that many of the input
285 * coefficients are zero, especially the AC terms. We can exploit this
286 * by short-circuiting the IDCT calculation for any row in which all
287 * the AC terms are zero. In that case each output is equal to the
288 * DC coefficient (with scale factor as needed).
289 * With typical images and quantization tables, half or more of the
290 * row DCT calculations can be simplified this way.
291 */
292
293 register INT32 *idataptr = (INT32*)dataptr;
294 d0 = dataptr[0];
295 d1 = dataptr[1];
296 if ((d1 == 0) && (idataptr[1] | idataptr[2] | idataptr[3]) == 0) {
297 /* AC terms all zero */
298 if (d0) {
299 /* Compute a 32 bit value to assign. */
300 DCTELEM dcval = (DCTELEM) (d0 << PASS1_BITS);
301 register INT32 v = (dcval & 0xffff) |
302 (((INT32)dcval << 16) & 0xffff0000L);
303
304 idataptr[0] = v;
305 idataptr[1] = v;
306 idataptr[2] = v;
307 idataptr[3] = v;
308 }
309
310 dataptr += DCTSIZE; /* advance pointer to next row */
311 continue;
312 }
313 d2 = dataptr[2];
314 d3 = dataptr[3];
315 d4 = dataptr[4];
316 d5 = dataptr[5];
317 d6 = dataptr[6];
318 d7 = dataptr[7];
319
320 /* Even part: reverse the even part of the forward DCT. */
321 /* The rotator is sqrt(2)*c(-6). */
322 if (d6) {
323 if (d4) {
324 if (d2) {
325 if (d0) {
326 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
327 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
328 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
329 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
330
331 tmp0 = SCALE (d0 + d4, CONST_BITS);
332 tmp1 = SCALE (d0 - d4, CONST_BITS);
333
334 tmp10 = tmp0 + tmp3;
335 tmp13 = tmp0 - tmp3;
336 tmp11 = tmp1 + tmp2;
337 tmp12 = tmp1 - tmp2;
338 } else {
339 /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
340 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
341 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
342 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
343
344 tmp0 = SCALE (d4, CONST_BITS);
345
346 tmp10 = tmp0 + tmp3;
347 tmp13 = tmp0 - tmp3;
348 tmp11 = tmp2 - tmp0;
349 tmp12 = -(tmp0 + tmp2);
350 }
351 } else {
352 if (d0) {
353 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
354 tmp2 = MULTIPLY(d6, - FIX(1.306562965));
355 tmp3 = MULTIPLY(d6, FIX(0.541196100));
356
357 tmp0 = SCALE (d0 + d4, CONST_BITS);
358 tmp1 = SCALE (d0 - d4, CONST_BITS);
359
360 tmp10 = tmp0 + tmp3;
361 tmp13 = tmp0 - tmp3;
362 tmp11 = tmp1 + tmp2;
363 tmp12 = tmp1 - tmp2;
364 } else {
365 /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
366 tmp2 = MULTIPLY(d6, -FIX(1.306562965));
367 tmp3 = MULTIPLY(d6, FIX(0.541196100));
368
369 tmp0 = SCALE (d4, CONST_BITS);
370
371 tmp10 = tmp0 + tmp3;
372 tmp13 = tmp0 - tmp3;
373 tmp11 = tmp2 - tmp0;
374 tmp12 = -(tmp0 + tmp2);
375 }
376 }
377 } else {
378 if (d2) {
379 if (d0) {
380 /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
381 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
382 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
383 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
384
385 tmp0 = SCALE (d0, CONST_BITS);
386
387 tmp10 = tmp0 + tmp3;
388 tmp13 = tmp0 - tmp3;
389 tmp11 = tmp0 + tmp2;
390 tmp12 = tmp0 - tmp2;
391 } else {
392 /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
393 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
394 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
395 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
396
397 tmp10 = tmp3;
398 tmp13 = -tmp3;
399 tmp11 = tmp2;
400 tmp12 = -tmp2;
401 }
402 } else {
403 if (d0) {
404 /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
405 tmp2 = MULTIPLY(d6, - FIX(1.306562965));
406 tmp3 = MULTIPLY(d6, FIX(0.541196100));
407
408 tmp0 = SCALE (d0, CONST_BITS);
409
410 tmp10 = tmp0 + tmp3;
411 tmp13 = tmp0 - tmp3;
412 tmp11 = tmp0 + tmp2;
413 tmp12 = tmp0 - tmp2;
414 } else {
415 /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
416 tmp2 = MULTIPLY(d6, - FIX(1.306562965));
417 tmp3 = MULTIPLY(d6, FIX(0.541196100));
418
419 tmp10 = tmp3;
420 tmp13 = -tmp3;
421 tmp11 = tmp2;
422 tmp12 = -tmp2;
423 }
424 }
425 }
426 } else {
427 if (d4) {
428 if (d2) {
429 if (d0) {
430 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
431 tmp2 = MULTIPLY(d2, FIX(0.541196100));
432 tmp3 = MULTIPLY(d2, FIX(1.306562965));
433
434 tmp0 = SCALE (d0 + d4, CONST_BITS);
435 tmp1 = SCALE (d0 - d4, CONST_BITS);
436
437 tmp10 = tmp0 + tmp3;
438 tmp13 = tmp0 - tmp3;
439 tmp11 = tmp1 + tmp2;
440 tmp12 = tmp1 - tmp2;
441 } else {
442 /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
443 tmp2 = MULTIPLY(d2, FIX(0.541196100));
444 tmp3 = MULTIPLY(d2, FIX(1.306562965));
445
446 tmp0 = SCALE (d4, CONST_BITS);
447
448 tmp10 = tmp0 + tmp3;
449 tmp13 = tmp0 - tmp3;
450 tmp11 = tmp2 - tmp0;
451 tmp12 = -(tmp0 + tmp2);
452 }
453 } else {
454 if (d0) {
455 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
456 tmp10 = tmp13 = SCALE (d0 + d4, CONST_BITS);
457 tmp11 = tmp12 = SCALE (d0 - d4, CONST_BITS);
458 } else {
459 /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
460 tmp10 = tmp13 = SCALE (d4, CONST_BITS);
461 tmp11 = tmp12 = -tmp10;
462 }
463 }
464 } else {
465 if (d2) {
466 if (d0) {
467 /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
468 tmp2 = MULTIPLY(d2, FIX(0.541196100));
469 tmp3 = MULTIPLY(d2, FIX(1.306562965));
470
471 tmp0 = SCALE (d0, CONST_BITS);
472
473 tmp10 = tmp0 + tmp3;
474 tmp13 = tmp0 - tmp3;
475 tmp11 = tmp0 + tmp2;
476 tmp12 = tmp0 - tmp2;
477 } else {
478 /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
479 tmp2 = MULTIPLY(d2, FIX(0.541196100));
480 tmp3 = MULTIPLY(d2, FIX(1.306562965));
481
482 tmp10 = tmp3;
483 tmp13 = -tmp3;
484 tmp11 = tmp2;
485 tmp12 = -tmp2;
486 }
487 } else {
488 if (d0) {
489 /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
490 tmp10 = tmp13 = tmp11 = tmp12 = SCALE (d0, CONST_BITS);
491 } else {
492 /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
493 tmp10 = tmp13 = tmp11 = tmp12 = 0;
494 }
495 }
496 }
497 }
498
499
500 /* Odd part per figure 8; the matrix is unitary and hence its
501 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
502 */
503
504 if (d7) {
505 if (d5) {
506 if (d3) {
507 if (d1) {
508 /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
509 z1 = d7 + d1;
510 z2 = d5 + d3;
511 z3 = d7 + d3;
512 z4 = d5 + d1;
513 z5 = MULTIPLY(z3 + z4, FIX(1.175875602));
514
515 tmp0 = MULTIPLY(d7, FIX(0.298631336));
516 tmp1 = MULTIPLY(d5, FIX(2.053119869));
517 tmp2 = MULTIPLY(d3, FIX(3.072711026));
518 tmp3 = MULTIPLY(d1, FIX(1.501321110));
519 z1 = MULTIPLY(z1, - FIX(0.899976223));
520 z2 = MULTIPLY(z2, - FIX(2.562915447));
521 z3 = MULTIPLY(z3, - FIX(1.961570560));
522 z4 = MULTIPLY(z4, - FIX(0.390180644));
523
524 z3 += z5;
525 z4 += z5;
526
527 tmp0 += z1 + z3;
528 tmp1 += z2 + z4;
529 tmp2 += z2 + z3;
530 tmp3 += z1 + z4;
531 } else {
532 /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
533 z1 = d7;
534 z2 = d5 + d3;
535 z3 = d7 + d3;
536 z5 = MULTIPLY(z3 + d5, FIX(1.175875602));
537
538 tmp0 = MULTIPLY(d7, FIX(0.298631336));
539 tmp1 = MULTIPLY(d5, FIX(2.053119869));
540 tmp2 = MULTIPLY(d3, FIX(3.072711026));
541 z1 = MULTIPLY(d7, - FIX(0.899976223));
542 z2 = MULTIPLY(z2, - FIX(2.562915447));
543 z3 = MULTIPLY(z3, - FIX(1.961570560));
544 z4 = MULTIPLY(d5, - FIX(0.390180644));
545
546 z3 += z5;
547 z4 += z5;
548
549 tmp0 += z1 + z3;
550 tmp1 += z2 + z4;
551 tmp2 += z2 + z3;
552 tmp3 = z1 + z4;
553 }
554 } else {
555 if (d1) {
556 /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
557 z1 = d7 + d1;
558 z2 = d5;
559 z3 = d7;
560 z4 = d5 + d1;
561 z5 = MULTIPLY(z3 + z4, FIX(1.175875602));
562
563 tmp0 = MULTIPLY(d7, FIX(0.298631336));
564 tmp1 = MULTIPLY(d5, FIX(2.053119869));
565 tmp3 = MULTIPLY(d1, FIX(1.501321110));
566 z1 = MULTIPLY(z1, - FIX(0.899976223));
567 z2 = MULTIPLY(d5, - FIX(2.562915447));
568 z3 = MULTIPLY(d7, - FIX(1.961570560));
569 z4 = MULTIPLY(z4, - FIX(0.390180644));
570
571 z3 += z5;
572 z4 += z5;
573
574 tmp0 += z1 + z3;
575 tmp1 += z2 + z4;
576 tmp2 = z2 + z3;
577 tmp3 += z1 + z4;
578 } else {
579 /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
580 tmp0 = MULTIPLY(d7, - FIX(0.601344887));
581 z1 = MULTIPLY(d7, - FIX(0.899976223));
582 z3 = MULTIPLY(d7, - FIX(1.961570560));
583 tmp1 = MULTIPLY(d5, - FIX(0.509795578));
584 z2 = MULTIPLY(d5, - FIX(2.562915447));
585 z4 = MULTIPLY(d5, - FIX(0.390180644));
586 z5 = MULTIPLY(d5 + d7, FIX(1.175875602));
587
588 z3 += z5;
589 z4 += z5;
590
591 tmp0 += z3;
592 tmp1 += z4;
593 tmp2 = z2 + z3;
594 tmp3 = z1 + z4;
595 }
596 }
597 } else {
598 if (d3) {
599 if (d1) {
600 /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
601 z1 = d7 + d1;
602 z3 = d7 + d3;
603 z5 = MULTIPLY(z3 + d1, FIX(1.175875602));
604
605 tmp0 = MULTIPLY(d7, FIX(0.298631336));
606 tmp2 = MULTIPLY(d3, FIX(3.072711026));
607 tmp3 = MULTIPLY(d1, FIX(1.501321110));
608 z1 = MULTIPLY(z1, - FIX(0.899976223));
609 z2 = MULTIPLY(d3, - FIX(2.562915447));
610 z3 = MULTIPLY(z3, - FIX(1.961570560));
611 z4 = MULTIPLY(d1, - FIX(0.390180644));
612
613 z3 += z5;
614 z4 += z5;
615
616 tmp0 += z1 + z3;
617 tmp1 = z2 + z4;
618 tmp2 += z2 + z3;
619 tmp3 += z1 + z4;
620 } else {
621 /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
622 z3 = d7 + d3;
623
624 tmp0 = MULTIPLY(d7, - FIX(0.601344887));
625 z1 = MULTIPLY(d7, - FIX(0.899976223));
626 tmp2 = MULTIPLY(d3, FIX(0.509795579));
627 z2 = MULTIPLY(d3, - FIX(2.562915447));
628 z5 = MULTIPLY(z3, FIX(1.175875602));
629 z3 = MULTIPLY(z3, - FIX(0.785694958));
630
631 tmp0 += z3;
632 tmp1 = z2 + z5;
633 tmp2 += z3;
634 tmp3 = z1 + z5;
635 }
636 } else {
637 if (d1) {
638 /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
639 z1 = d7 + d1;
640 z5 = MULTIPLY(z1, FIX(1.175875602));
641
642 z1 = MULTIPLY(z1, FIX(0.275899379));
643 z3 = MULTIPLY(d7, - FIX(1.961570560));
644 tmp0 = MULTIPLY(d7, - FIX(1.662939224));
645 z4 = MULTIPLY(d1, - FIX(0.390180644));
646 tmp3 = MULTIPLY(d1, FIX(1.111140466));
647
648 tmp0 += z1;
649 tmp1 = z4 + z5;
650 tmp2 = z3 + z5;
651 tmp3 += z1;
652 } else {
653 /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
654 tmp0 = MULTIPLY(d7, - FIX(1.387039845));
655 tmp1 = MULTIPLY(d7, FIX(1.175875602));
656 tmp2 = MULTIPLY(d7, - FIX(0.785694958));
657 tmp3 = MULTIPLY(d7, FIX(0.275899379));
658 }
659 }
660 }
661 } else {
662 if (d5) {
663 if (d3) {
664 if (d1) {
665 /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
666 z2 = d5 + d3;
667 z4 = d5 + d1;
668 z5 = MULTIPLY(d3 + z4, FIX(1.175875602));
669
670 tmp1 = MULTIPLY(d5, FIX(2.053119869));
671 tmp2 = MULTIPLY(d3, FIX(3.072711026));
672 tmp3 = MULTIPLY(d1, FIX(1.501321110));
673 z1 = MULTIPLY(d1, - FIX(0.899976223));
674 z2 = MULTIPLY(z2, - FIX(2.562915447));
675 z3 = MULTIPLY(d3, - FIX(1.961570560));
676 z4 = MULTIPLY(z4, - FIX(0.390180644));
677
678 z3 += z5;
679 z4 += z5;
680
681 tmp0 = z1 + z3;
682 tmp1 += z2 + z4;
683 tmp2 += z2 + z3;
684 tmp3 += z1 + z4;
685 } else {
686 /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
687 z2 = d5 + d3;
688
689 z5 = MULTIPLY(z2, FIX(1.175875602));
690 tmp1 = MULTIPLY(d5, FIX(1.662939225));
691 z4 = MULTIPLY(d5, - FIX(0.390180644));
692 z2 = MULTIPLY(z2, - FIX(1.387039845));
693 tmp2 = MULTIPLY(d3, FIX(1.111140466));
694 z3 = MULTIPLY(d3, - FIX(1.961570560));
695
696 tmp0 = z3 + z5;
697 tmp1 += z2;
698 tmp2 += z2;
699 tmp3 = z4 + z5;
700 }
701 } else {
702 if (d1) {
703 /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
704 z4 = d5 + d1;
705
706 z5 = MULTIPLY(z4, FIX(1.175875602));
707 z1 = MULTIPLY(d1, - FIX(0.899976223));
708 tmp3 = MULTIPLY(d1, FIX(0.601344887));
709 tmp1 = MULTIPLY(d5, - FIX(0.509795578));
710 z2 = MULTIPLY(d5, - FIX(2.562915447));
711 z4 = MULTIPLY(z4, FIX(0.785694958));
712
713 tmp0 = z1 + z5;
714 tmp1 += z4;
715 tmp2 = z2 + z5;
716 tmp3 += z4;
717 } else {
718 /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
719 tmp0 = MULTIPLY(d5, FIX(1.175875602));
720 tmp1 = MULTIPLY(d5, FIX(0.275899380));
721 tmp2 = MULTIPLY(d5, - FIX(1.387039845));
722 tmp3 = MULTIPLY(d5, FIX(0.785694958));
723 }
724 }
725 } else {
726 if (d3) {
727 if (d1) {
728 /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
729 z5 = d1 + d3;
730 tmp3 = MULTIPLY(d1, FIX(0.211164243));
731 tmp2 = MULTIPLY(d3, - FIX(1.451774981));
732 z1 = MULTIPLY(d1, FIX(1.061594337));
733 z2 = MULTIPLY(d3, - FIX(2.172734803));
734 z4 = MULTIPLY(z5, FIX(0.785694958));
735 z5 = MULTIPLY(z5, FIX(1.175875602));
736
737 tmp0 = z1 - z4;
738 tmp1 = z2 + z4;
739 tmp2 += z5;
740 tmp3 += z5;
741 } else {
742 /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
743 tmp0 = MULTIPLY(d3, - FIX(0.785694958));
744 tmp1 = MULTIPLY(d3, - FIX(1.387039845));
745 tmp2 = MULTIPLY(d3, - FIX(0.275899379));
746 tmp3 = MULTIPLY(d3, FIX(1.175875602));
747 }
748 } else {
749 if (d1) {
750 /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
751 tmp0 = MULTIPLY(d1, FIX(0.275899379));
752 tmp1 = MULTIPLY(d1, FIX(0.785694958));
753 tmp2 = MULTIPLY(d1, FIX(1.175875602));
754 tmp3 = MULTIPLY(d1, FIX(1.387039845));
755 } else {
756 /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
757 tmp0 = tmp1 = tmp2 = tmp3 = 0;
758 }
759 }
760 }
761 }
762
763 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
764
765 dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
766 dataptr[7] = (DCTELEM) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
767 dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
768 dataptr[6] = (DCTELEM) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
769 dataptr[2] = (DCTELEM) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
770 dataptr[5] = (DCTELEM) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
771 dataptr[3] = (DCTELEM) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
772 dataptr[4] = (DCTELEM) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
773
774 dataptr += DCTSIZE; /* advance pointer to next row */
775 }
776
777 /* Pass 2: process columns. */
778 /* Note that we must descale the results by a factor of 8 == 2**3, */
779 /* and also undo the PASS1_BITS scaling. */
780
781 dataptr = data;
782 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
783 /* Columns of zeroes can be exploited in the same way as we did with rows.
784 * However, the row calculation has created many nonzero AC terms, so the
785 * simplification applies less often (typically 5% to 10% of the time).
786 * On machines with very fast multiplication, it's possible that the
787 * test takes more time than it's worth. In that case this section
788 * may be commented out.
789 */
790
791 d0 = dataptr[DCTSIZE*0];
792 d1 = dataptr[DCTSIZE*1];
793 d2 = dataptr[DCTSIZE*2];
794 d3 = dataptr[DCTSIZE*3];
795 d4 = dataptr[DCTSIZE*4];
796 d5 = dataptr[DCTSIZE*5];
797 d6 = dataptr[DCTSIZE*6];
798 d7 = dataptr[DCTSIZE*7];
799
800 /* Even part: reverse the even part of the forward DCT. */
801 /* The rotator is sqrt(2)*c(-6). */
802 if (d6) {
803 if (d4) {
804 if (d2) {
805 if (d0) {
806 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
807 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
808 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
809 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
810
811 tmp0 = SCALE (d0 + d4, CONST_BITS);
812 tmp1 = SCALE (d0 - d4, CONST_BITS);
813
814 tmp10 = tmp0 + tmp3;
815 tmp13 = tmp0 - tmp3;
816 tmp11 = tmp1 + tmp2;
817 tmp12 = tmp1 - tmp2;
818 } else {
819 /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
820 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
821 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
822 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
823
824 tmp0 = SCALE (d4, CONST_BITS);
825
826 tmp10 = tmp0 + tmp3;
827 tmp13 = tmp0 - tmp3;
828 tmp11 = tmp2 - tmp0;
829 tmp12 = -(tmp0 + tmp2);
830 }
831 } else {
832 if (d0) {
833 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
834 tmp2 = MULTIPLY(d6, - FIX(1.306562965));
835 tmp3 = MULTIPLY(d6, FIX(0.541196100));
836
837 tmp0 = SCALE (d0 + d4, CONST_BITS);
838 tmp1 = SCALE (d0 - d4, CONST_BITS);
839
840 tmp10 = tmp0 + tmp3;
841 tmp13 = tmp0 - tmp3;
842 tmp11 = tmp1 + tmp2;
843 tmp12 = tmp1 - tmp2;
844 } else {
845 /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
846 tmp2 = MULTIPLY(d6, -FIX(1.306562965));
847 tmp3 = MULTIPLY(d6, FIX(0.541196100));
848
849 tmp0 = SCALE (d4, CONST_BITS);
850
851 tmp10 = tmp0 + tmp3;
852 tmp13 = tmp0 - tmp3;
853 tmp11 = tmp2 - tmp0;
854 tmp12 = -(tmp0 + tmp2);
855 }
856 }
857 } else {
858 if (d2) {
859 if (d0) {
860 /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
861 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
862 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
863 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
864
865 tmp0 = SCALE (d0, CONST_BITS);
866
867 tmp10 = tmp0 + tmp3;
868 tmp13 = tmp0 - tmp3;
869 tmp11 = tmp0 + tmp2;
870 tmp12 = tmp0 - tmp2;
871 } else {
872 /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
873 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
874 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
875 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
876
877 tmp10 = tmp3;
878 tmp13 = -tmp3;
879 tmp11 = tmp2;
880 tmp12 = -tmp2;
881 }
882 } else {
883 if (d0) {
884 /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
885 tmp2 = MULTIPLY(d6, - FIX(1.306562965));
886 tmp3 = MULTIPLY(d6, FIX(0.541196100));
887
888 tmp0 = SCALE (d0, CONST_BITS);
889
890 tmp10 = tmp0 + tmp3;
891 tmp13 = tmp0 - tmp3;
892 tmp11 = tmp0 + tmp2;
893 tmp12 = tmp0 - tmp2;
894 } else {
895 /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
896 tmp2 = MULTIPLY(d6, - FIX(1.306562965));
897 tmp3 = MULTIPLY(d6, FIX(0.541196100));
898
899 tmp10 = tmp3;
900 tmp13 = -tmp3;
901 tmp11 = tmp2;
902 tmp12 = -tmp2;
903 }
904 }
905 }
906 } else {
907 if (d4) {
908 if (d2) {
909 if (d0) {
910 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
911 tmp2 = MULTIPLY(d2, FIX(0.541196100));
912 tmp3 = MULTIPLY(d2, FIX(1.306562965));
913
914 tmp0 = SCALE (d0 + d4, CONST_BITS);
915 tmp1 = SCALE (d0 - d4, CONST_BITS);
916
917 tmp10 = tmp0 + tmp3;
918 tmp13 = tmp0 - tmp3;
919 tmp11 = tmp1 + tmp2;
920 tmp12 = tmp1 - tmp2;
921 } else {
922 /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
923 tmp2 = MULTIPLY(d2, FIX(0.541196100));
924 tmp3 = MULTIPLY(d2, FIX(1.306562965));
925
926 tmp0 = SCALE (d4, CONST_BITS);
927
928 tmp10 = tmp0 + tmp3;
929 tmp13 = tmp0 - tmp3;
930 tmp11 = tmp2 - tmp0;
931 tmp12 = -(tmp0 + tmp2);
932 }
933 } else {
934 if (d0) {
935 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
936 tmp10 = tmp13 = SCALE (d0 + d4, CONST_BITS);
937 tmp11 = tmp12 = SCALE (d0 - d4, CONST_BITS);
938 } else {
939 /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
940 tmp10 = tmp13 = SCALE (d4, CONST_BITS);
941 tmp11 = tmp12 = -tmp10;
942 }
943 }
944 } else {
945 if (d2) {
946 if (d0) {
947 /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
948 tmp2 = MULTIPLY(d2, FIX(0.541196100));
949 tmp3 = MULTIPLY(d2, FIX(1.306562965));
950
951 tmp0 = SCALE (d0, CONST_BITS);
952
953 tmp10 = tmp0 + tmp3;
954 tmp13 = tmp0 - tmp3;
955 tmp11 = tmp0 + tmp2;
956 tmp12 = tmp0 - tmp2;
957 } else {
958 /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
959 tmp2 = MULTIPLY(d2, FIX(0.541196100));
960 tmp3 = MULTIPLY(d2, FIX(1.306562965));
961
962 tmp10 = tmp3;
963 tmp13 = -tmp3;
964 tmp11 = tmp2;
965 tmp12 = -tmp2;
966 }
967 } else {
968 if (d0) {
969 /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
970 tmp10 = tmp13 = tmp11 = tmp12 = SCALE (d0, CONST_BITS);
971 } else {
972 /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
973 tmp10 = tmp13 = tmp11 = tmp12 = 0;
974 }
975 }
976 }
977 }
978
979 /* Odd part per figure 8; the matrix is unitary and hence its
980 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
981 */
982 if (d7) {
983 if (d5) {
984 if (d3) {
985 if (d1) {
986 /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
987 z1 = d7 + d1;
988 z2 = d5 + d3;
989 z3 = d7 + d3;
990 z4 = d5 + d1;
991 z5 = MULTIPLY(z3 + z4, FIX(1.175875602));
992
993 tmp0 = MULTIPLY(d7, FIX(0.298631336));
994 tmp1 = MULTIPLY(d5, FIX(2.053119869));
995 tmp2 = MULTIPLY(d3, FIX(3.072711026));
996 tmp3 = MULTIPLY(d1, FIX(1.501321110));
997 z1 = MULTIPLY(z1, - FIX(0.899976223));
998 z2 = MULTIPLY(z2, - FIX(2.562915447));
999 z3 = MULTIPLY(z3, - FIX(1.961570560));
1000 z4 = MULTIPLY(z4, - FIX(0.390180644));
1001
1002 z3 += z5;
1003 z4 += z5;
1004
1005 tmp0 += z1 + z3;
1006 tmp1 += z2 + z4;
1007 tmp2 += z2 + z3;
1008 tmp3 += z1 + z4;
1009 } else {
1010 /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
1011 z1 = d7;
1012 z2 = d5 + d3;
1013 z3 = d7 + d3;
1014 z5 = MULTIPLY(z3 + d5, FIX(1.175875602));
1015
1016 tmp0 = MULTIPLY(d7, FIX(0.298631336));
1017 tmp1 = MULTIPLY(d5, FIX(2.053119869));
1018 tmp2 = MULTIPLY(d3, FIX(3.072711026));
1019 z1 = MULTIPLY(d7, - FIX(0.899976223));
1020 z2 = MULTIPLY(z2, - FIX(2.562915447));
1021 z3 = MULTIPLY(z3, - FIX(1.961570560));
1022 z4 = MULTIPLY(d5, - FIX(0.390180644));
1023
1024 z3 += z5;
1025 z4 += z5;
1026
1027 tmp0 += z1 + z3;
1028 tmp1 += z2 + z4;
1029 tmp2 += z2 + z3;
1030 tmp3 = z1 + z4;
1031 }
1032 } else {
1033 if (d1) {
1034 /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
1035 z1 = d7 + d1;
1036 z2 = d5;
1037 z3 = d7;
1038 z4 = d5 + d1;
1039 z5 = MULTIPLY(z3 + z4, FIX(1.175875602));
1040
1041 tmp0 = MULTIPLY(d7, FIX(0.298631336));
1042 tmp1 = MULTIPLY(d5, FIX(2.053119869));
1043 tmp3 = MULTIPLY(d1, FIX(1.501321110));
1044 z1 = MULTIPLY(z1, - FIX(0.899976223));
1045 z2 = MULTIPLY(d5, - FIX(2.562915447));
1046 z3 = MULTIPLY(d7, - FIX(1.961570560));
1047 z4 = MULTIPLY(z4, - FIX(0.390180644));
1048
1049 z3 += z5;
1050 z4 += z5;
1051
1052 tmp0 += z1 + z3;
1053 tmp1 += z2 + z4;
1054 tmp2 = z2 + z3;
1055 tmp3 += z1 + z4;
1056 } else {
1057 /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
1058 tmp0 = MULTIPLY(d7, - FIX(0.601344887));
1059 z1 = MULTIPLY(d7, - FIX(0.899976223));
1060 z3 = MULTIPLY(d7, - FIX(1.961570560));
1061 tmp1 = MULTIPLY(d5, - FIX(0.509795578));
1062 z2 = MULTIPLY(d5, - FIX(2.562915447));
1063 z4 = MULTIPLY(d5, - FIX(0.390180644));
1064 z5 = MULTIPLY(d5 + d7, FIX(1.175875602));
1065
1066 z3 += z5;
1067 z4 += z5;
1068
1069 tmp0 += z3;
1070 tmp1 += z4;
1071 tmp2 = z2 + z3;
1072 tmp3 = z1 + z4;
1073 }
1074 }
1075 } else {
1076 if (d3) {
1077 if (d1) {
1078 /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
1079 z1 = d7 + d1;
1080 z3 = d7 + d3;
1081 z5 = MULTIPLY(z3 + d1, FIX(1.175875602));
1082
1083 tmp0 = MULTIPLY(d7, FIX(0.298631336));
1084 tmp2 = MULTIPLY(d3, FIX(3.072711026));
1085 tmp3 = MULTIPLY(d1, FIX(1.501321110));
1086 z1 = MULTIPLY(z1, - FIX(0.899976223));
1087 z2 = MULTIPLY(d3, - FIX(2.562915447));
1088 z3 = MULTIPLY(z3, - FIX(1.961570560));
1089 z4 = MULTIPLY(d1, - FIX(0.390180644));
1090
1091 z3 += z5;
1092 z4 += z5;
1093
1094 tmp0 += z1 + z3;
1095 tmp1 = z2 + z4;
1096 tmp2 += z2 + z3;
1097 tmp3 += z1 + z4;
1098 } else {
1099 /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
1100 z3 = d7 + d3;
1101
1102 tmp0 = MULTIPLY(d7, - FIX(0.601344887));
1103 z1 = MULTIPLY(d7, - FIX(0.899976223));
1104 tmp2 = MULTIPLY(d3, FIX(0.509795579));
1105 z2 = MULTIPLY(d3, - FIX(2.562915447));
1106 z5 = MULTIPLY(z3, FIX(1.175875602));
1107 z3 = MULTIPLY(z3, - FIX(0.785694958));
1108
1109 tmp0 += z3;
1110 tmp1 = z2 + z5;
1111 tmp2 += z3;
1112 tmp3 = z1 + z5;
1113 }
1114 } else {
1115 if (d1) {
1116 /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
1117 z1 = d7 + d1;
1118 z5 = MULTIPLY(z1, FIX(1.175875602));
1119
1120 z1 = MULTIPLY(z1, FIX(0.275899379));
1121 z3 = MULTIPLY(d7, - FIX(1.961570560));
1122 tmp0 = MULTIPLY(d7, - FIX(1.662939224));
1123 z4 = MULTIPLY(d1, - FIX(0.390180644));
1124 tmp3 = MULTIPLY(d1, FIX(1.111140466));
1125
1126 tmp0 += z1;
1127 tmp1 = z4 + z5;
1128 tmp2 = z3 + z5;
1129 tmp3 += z1;
1130 } else {
1131 /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
1132 tmp0 = MULTIPLY(d7, - FIX(1.387039845));
1133 tmp1 = MULTIPLY(d7, FIX(1.175875602));
1134 tmp2 = MULTIPLY(d7, - FIX(0.785694958));
1135 tmp3 = MULTIPLY(d7, FIX(0.275899379));
1136 }
1137 }
1138 }
1139 } else {
1140 if (d5) {
1141 if (d3) {
1142 if (d1) {
1143 /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
1144 z2 = d5 + d3;
1145 z4 = d5 + d1;
1146 z5 = MULTIPLY(d3 + z4, FIX(1.175875602));
1147
1148 tmp1 = MULTIPLY(d5, FIX(2.053119869));
1149 tmp2 = MULTIPLY(d3, FIX(3.072711026));
1150 tmp3 = MULTIPLY(d1, FIX(1.501321110));
1151 z1 = MULTIPLY(d1, - FIX(0.899976223));
1152 z2 = MULTIPLY(z2, - FIX(2.562915447));
1153 z3 = MULTIPLY(d3, - FIX(1.961570560));
1154 z4 = MULTIPLY(z4, - FIX(0.390180644));
1155
1156 z3 += z5;
1157 z4 += z5;
1158
1159 tmp0 = z1 + z3;
1160 tmp1 += z2 + z4;
1161 tmp2 += z2 + z3;
1162 tmp3 += z1 + z4;
1163 } else {
1164 /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
1165 z2 = d5 + d3;
1166
1167 z5 = MULTIPLY(z2, FIX(1.175875602));
1168 tmp1 = MULTIPLY(d5, FIX(1.662939225));
1169 z4 = MULTIPLY(d5, - FIX(0.390180644));
1170 z2 = MULTIPLY(z2, - FIX(1.387039845));
1171 tmp2 = MULTIPLY(d3, FIX(1.111140466));
1172 z3 = MULTIPLY(d3, - FIX(1.961570560));
1173
1174 tmp0 = z3 + z5;
1175 tmp1 += z2;
1176 tmp2 += z2;
1177 tmp3 = z4 + z5;
1178 }
1179 } else {
1180 if (d1) {
1181 /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
1182 z4 = d5 + d1;
1183
1184 z5 = MULTIPLY(z4, FIX(1.175875602));
1185 z1 = MULTIPLY(d1, - FIX(0.899976223));
1186 tmp3 = MULTIPLY(d1, FIX(0.601344887));
1187 tmp1 = MULTIPLY(d5, - FIX(0.509795578));
1188 z2 = MULTIPLY(d5, - FIX(2.562915447));
1189 z4 = MULTIPLY(z4, FIX(0.785694958));
1190
1191 tmp0 = z1 + z5;
1192 tmp1 += z4;
1193 tmp2 = z2 + z5;
1194 tmp3 += z4;
1195 } else {
1196 /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
1197 tmp0 = MULTIPLY(d5, FIX(1.175875602));
1198 tmp1 = MULTIPLY(d5, FIX(0.275899380));
1199 tmp2 = MULTIPLY(d5, - FIX(1.387039845));
1200 tmp3 = MULTIPLY(d5, FIX(0.785694958));
1201 }
1202 }
1203 } else {
1204 if (d3) {
1205 if (d1) {
1206 /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
1207 z5 = d1 + d3;
1208 tmp3 = MULTIPLY(d1, FIX(0.211164243));
1209 tmp2 = MULTIPLY(d3, - FIX(1.451774981));
1210 z1 = MULTIPLY(d1, FIX(1.061594337));
1211 z2 = MULTIPLY(d3, - FIX(2.172734803));
1212 z4 = MULTIPLY(z5, FIX(0.785694958));
1213 z5 = MULTIPLY(z5, FIX(1.175875602));
1214
1215 tmp0 = z1 - z4;
1216 tmp1 = z2 + z4;
1217 tmp2 += z5;
1218 tmp3 += z5;
1219 } else {
1220 /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
1221 tmp0 = MULTIPLY(d3, - FIX(0.785694958));
1222 tmp1 = MULTIPLY(d3, - FIX(1.387039845));
1223 tmp2 = MULTIPLY(d3, - FIX(0.275899379));
1224 tmp3 = MULTIPLY(d3, FIX(1.175875602));
1225 }
1226 } else {
1227 if (d1) {
1228 /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
1229 tmp0 = MULTIPLY(d1, FIX(0.275899379));
1230 tmp1 = MULTIPLY(d1, FIX(0.785694958));
1231 tmp2 = MULTIPLY(d1, FIX(1.175875602));
1232 tmp3 = MULTIPLY(d1, FIX(1.387039845));
1233 } else {
1234 /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
1235 tmp0 = tmp1 = tmp2 = tmp3 = 0;
1236 }
1237 }
1238 }
1239 }
1240
1241 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1242
1243 dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp3,
1244 CONST_BITS+PASS1_BITS+3);
1245 dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp10 - tmp3,
1246 CONST_BITS+PASS1_BITS+3);
1247 dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp11 + tmp2,
1248 CONST_BITS+PASS1_BITS+3);
1249 dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(tmp11 - tmp2,
1250 CONST_BITS+PASS1_BITS+3);
1251 dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp12 + tmp1,
1252 CONST_BITS+PASS1_BITS+3);
1253 dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12 - tmp1,
1254 CONST_BITS+PASS1_BITS+3);
1255 dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp13 + tmp0,
1256 CONST_BITS+PASS1_BITS+3);
1257 dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp13 - tmp0,
1258 CONST_BITS+PASS1_BITS+3);
1259
1260 dataptr++; /* advance pointer to next column */
1261 }
1262 }
1263
1264 #else
1265
1266 /*---- debugging/tracing macros ----*/
1267
1268 #if _MSC_VER
1269 #pragma optimize("",on)
1270 #if _MSC_VER > 700
1271 /*#pragma optimize("l",off)*/
1272 #endif
1273 #endif
1274
1275 #define idct_single_pos0()
1276 #define idct_zero_col_stat()
1277 #define idct_zero_row_stat()
1278 #define idct_nonzero_col_stat()
1279 #define idct_nonzero_row_stat()
1280 #define DUMP_COEFS(p)
1281 #define TRACE(args)
1282 #define FAST_DCTPTRS 1
1283
1284 #if 0 /* to count cases */
1285 void idct_single_pos0 (void) { static int count; count++; }
1286 void idct_zero_col_stat (void) { static int count; count++; }
1287 void idct_zero_row_stat (void) { static int count; count++; }
1288 void idct_nonzero_col_stat (void) { static int count; count++; }
1289 void idct_nonzero_row_stat (void) { static int count; count++; }
1290 #undef idct_single_pos0
1291 #undef idct_zero_col_stat
1292 #undef idct_zero_row_stat
1293 #undef idct_nonzero_col_stat
1294 #undef idct_nonzero_row_stat
1295 #endif
1296
1297 void init_pre_idct (void) { }
1298
1299 void j_rev_dct_sparse (DCTBLOCK data, int pos)
1300 {
1301 /* If just DC Coefficient. */
1302
1303 if (pos == 0) {
1304 register DCTELEM *dp, *dq;
1305 DCTELEM dcval;
1306
1307 idct_single_pos0();
1308
1309 dp = data;
1310 dcval = dp[0];
1311 if (dcval < 0)
1312 dcval = (short)((dcval - 3) >> 3);
1313 else
1314 dcval = (short)((dcval + 4) >> 3);
1315
1316 if (dcval) {
1317 for (dq = dp + 64; dp < dq; dp += 8) {
1318 dp[3] = dp[2] = dp[1] = dp[0] = dcval;
1319 dp[7] = dp[6] = dp[5] = dp[4] = dcval;
1320 }
1321 }
1322 return;
1323 }
1324
1325 /* Some other coeff */
1326 j_rev_dct (data);
1327 }
1328
1329 #ifndef OPTIMIZE_ASM
1330 void j_rev_dct (DCTBLOCK data)
1331 {
1332 INT32 tmp0, tmp1, tmp2, tmp3;
1333 INT32 tmp10, tmp11, tmp12, tmp13;
1334 INT32 z1, z2, z3, z4, z5;
1335 register DCTELEM *dp;
1336 int rowctr;
1337 SHIFT_TEMPS;
1338
1339 /* Pass 1: process rows. */
1340 /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
1341 /* furthermore, we scale the results by 2**PASS1_BITS. */
1342
1343 DUMP_COEFS(data);
1344
1345 dp = data;
1346 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--, dp += DCTSIZE) {
1347 /* Due to quantization, we will usually find that many of the input
1348 * coefficients are zero, especially the AC terms. We can exploit this
1349 * by short-circuiting the IDCT calculation for any row in which all
1350 * the AC terms are zero. In that case each output is equal to the
1351 * DC coefficient (with scale factor as needed).
1352 * With typical images and quantization tables, half or more of the
1353 * row DCT calculations can be simplified this way.
1354 */
1355
1356 #if FAST_DCTPTRS
1357 #define d0 dp[0]
1358 #define d1 dp[1]
1359 #define d2 dp[2]
1360 #define d3 dp[3]
1361 #define d4 dp[4]
1362 #define d5 dp[5]
1363 #define d6 dp[6]
1364 #define d7 dp[7]
1365 #else
1366 int d0 = dp[0];
1367 int d1 = dp[1];
1368 int d2 = dp[2];
1369 int d3 = dp[3];
1370 int d4 = dp[4];
1371 int d5 = dp[5];
1372 int d6 = dp[6];
1373 int d7 = dp[7];
1374 #endif
1375
1376 #ifndef NO_ZERO_ROW_TEST
1377 if ((d1 | d2 | d3 | d4 | d5 | d6 | d7) == 0) {
1378 /* AC terms all zero */
1379 DCTELEM dcval = (DCTELEM) (d0 << PASS1_BITS);
1380
1381 if (d0) {
1382 dp[0] = dcval;
1383 dp[1] = dcval;
1384 dp[2] = dcval;
1385 dp[3] = dcval;
1386 dp[4] = dcval;
1387 dp[5] = dcval;
1388 dp[6] = dcval;
1389 dp[7] = dcval;
1390 }
1391 idct_zero_row_stat();
1392 continue;
1393 }
1394 #endif
1395
1396 idct_nonzero_row_stat();
1397
1398 /* Even part: reverse the even part of the forward DCT. */
1399 /* The rotator is sqrt(2)*c(-6). */
1400
1401 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1402 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1403 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1404
1405 tmp0 = SCALE (d0 + d4, CONST_BITS);
1406 tmp1 = SCALE (d0 - d4, CONST_BITS);
1407
1408 tmp10 = tmp0 + tmp3;
1409 tmp13 = tmp0 - tmp3;
1410 tmp11 = tmp1 + tmp2;
1411 tmp12 = tmp1 - tmp2;
1412
1413 /* Odd part per figure 8; the matrix is unitary and hence its
1414 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
1415 */
1416
1417 z1 = d7 + d1;
1418 z2 = d5 + d3;
1419 z3 = d7 + d3;
1420 z4 = d5 + d1;
1421 z5 = MULTIPLY(z3 + z4, FIX(1.175875602)); /* sqrt(2) * c3 */
1422
1423 tmp0 = MULTIPLY(d7, FIX(0.298631336)); /* sqrt(2) * (-c1+c3+c5-c7) */
1424 tmp1 = MULTIPLY(d5, FIX(2.053119869)); /* sqrt(2) * ( c1+c3-c5+c7) */
1425 tmp2 = MULTIPLY(d3, FIX(3.072711026)); /* sqrt(2) * ( c1+c3+c5-c7) */
1426 tmp3 = MULTIPLY(d1, FIX(1.501321110)); /* sqrt(2) * ( c1+c3-c5-c7) */
1427 z1 = MULTIPLY(z1, - FIX(0.899976223)); /* sqrt(2) * (c7-c3) */
1428 z2 = MULTIPLY(z2, - FIX(2.562915447)); /* sqrt(2) * (-c1-c3) */
1429 z3 = MULTIPLY(z3, - FIX(1.961570560)); /* sqrt(2) * (-c3-c5) */
1430 z4 = MULTIPLY(z4, - FIX(0.390180644)); /* sqrt(2) * (c5-c3) */
1431
1432 z3 += z5;
1433 z4 += z5;
1434
1435 tmp0 += z1 + z3;
1436 tmp1 += z2 + z4;
1437 tmp2 += z2 + z3;
1438 tmp3 += z1 + z4;
1439
1440 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1441
1442 dp[0] = (DCTELEM) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
1443 dp[7] = (DCTELEM) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
1444 dp[1] = (DCTELEM) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
1445 dp[6] = (DCTELEM) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
1446 dp[2] = (DCTELEM) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
1447 dp[5] = (DCTELEM) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
1448 dp[3] = (DCTELEM) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
1449 dp[4] = (DCTELEM) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
1450 }
1451 #if FAST_DCTPTRS
1452 #undef d0
1453 #undef d1
1454 #undef d2
1455 #undef d3
1456 #undef d4
1457 #undef d5
1458 #undef d6
1459 #undef d7
1460 #endif
1461
1462 /* Pass 2: process columns. */
1463 /* Note that we must descale the results by a factor of 8 == 2**3, */
1464 /* and also undo the PASS1_BITS scaling. */
1465
1466 dp = data;
1467 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--, dp++) {
1468 /* Columns of zeroes can be exploited in the same way as we did with rows.
1469 * However, the row calculation has created many nonzero AC terms, so the
1470 * simplification applies less often (typically 5% to 10% of the time).
1471 * On machines with very fast multiplication, it's possible that the
1472 * test takes more time than it's worth. In that case this section
1473 * may be commented out.
1474 */
1475
1476 #if FAST_DCTPTRS
1477 #define d0 dp[DCTSIZE*0]
1478 #define d1 dp[DCTSIZE*1]
1479 #define d2 dp[DCTSIZE*2]
1480 #define d3 dp[DCTSIZE*3]
1481 #define d4 dp[DCTSIZE*4]
1482 #define d5 dp[DCTSIZE*5]
1483 #define d6 dp[DCTSIZE*6]
1484 #define d7 dp[DCTSIZE*7]
1485 #else
1486 int d0 = dp[DCTSIZE*0];
1487 int d1 = dp[DCTSIZE*1];
1488 int d2 = dp[DCTSIZE*2];
1489 int d3 = dp[DCTSIZE*3];
1490 int d4 = dp[DCTSIZE*4];
1491 int d5 = dp[DCTSIZE*5];
1492 int d6 = dp[DCTSIZE*6];
1493 int d7 = dp[DCTSIZE*7];
1494 #endif
1495
1496 #ifndef NO_ZERO_COLUMN_TEST
1497 if ((d1 | d2 | d3 | d4 | d5 | d6 | d7) == 0) {
1498 /* AC terms all zero */
1499 DCTELEM dcval = (DCTELEM) DESCALE((INT32) d0, PASS1_BITS+3);
1500
1501 if (d0) {
1502 dp[DCTSIZE*0] = dcval;
1503 dp[DCTSIZE*1] = dcval;
1504 dp[DCTSIZE*2] = dcval;
1505 dp[DCTSIZE*3] = dcval;
1506 dp[DCTSIZE*4] = dcval;
1507 dp[DCTSIZE*5] = dcval;
1508 dp[DCTSIZE*6] = dcval;
1509 dp[DCTSIZE*7] = dcval;
1510 }
1511 idct_zero_col_stat();
1512 continue;
1513 }
1514 #endif
1515
1516 idct_nonzero_col_stat();
1517
1518 /* Even part: reverse the even part of the forward DCT. */
1519 /* The rotator is sqrt(2)*c(-6). */
1520
1521 z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
1522 tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
1523 tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));
1524
1525 tmp0 = SCALE (d0 + d4, CONST_BITS);
1526 tmp1 = SCALE (d0 - d4, CONST_BITS);
1527
1528 tmp10 = tmp0 + tmp3;
1529 tmp13 = tmp0 - tmp3;
1530 tmp11 = tmp1 + tmp2;
1531 tmp12 = tmp1 - tmp2;
1532
1533 /* Odd part per figure 8; the matrix is unitary and hence its
1534 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
1535 */
1536
1537 z1 = d7 + d1;
1538 z2 = d5 + d3;
1539 z3 = d7 + d3;
1540 z4 = d5 + d1;
1541 z5 = MULTIPLY(z3 + z4, FIX(1.175875602)); /* sqrt(2) * c3 */
1542
1543 tmp0 = MULTIPLY(d7, FIX(0.298631336)); /* sqrt(2) * (-c1+c3+c5-c7) */
1544 tmp1 = MULTIPLY(d5, FIX(2.053119869)); /* sqrt(2) * ( c1+c3-c5+c7) */
1545 tmp2 = MULTIPLY(d3, FIX(3.072711026)); /* sqrt(2) * ( c1+c3+c5-c7) */
1546 tmp3 = MULTIPLY(d1, FIX(1.501321110)); /* sqrt(2) * ( c1+c3-c5-c7) */
1547 z1 = MULTIPLY(z1, - FIX(0.899976223)); /* sqrt(2) * (c7-c3) */
1548 z2 = MULTIPLY(z2, - FIX(2.562915447)); /* sqrt(2) * (-c1-c3) */
1549 z3 = MULTIPLY(z3, - FIX(1.961570560)); /* sqrt(2) * (-c3-c5) */
1550 z4 = MULTIPLY(z4, - FIX(0.390180644)); /* sqrt(2) * (c5-c3) */
1551
1552 z3 += z5;
1553 z4 += z5;
1554
1555 tmp0 += z1 + z3;
1556 tmp1 += z2 + z4;
1557 tmp2 += z2 + z3;
1558 tmp3 += z1 + z4;
1559
1560 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1561
1562 dp[DCTSIZE*0] = (DCTELEM)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3);
1563 dp[DCTSIZE*7] = (DCTELEM)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3);
1564 dp[DCTSIZE*1] = (DCTELEM)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3);
1565 dp[DCTSIZE*6] = (DCTELEM)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3);
1566 dp[DCTSIZE*2] = (DCTELEM)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3);
1567 dp[DCTSIZE*5] = (DCTELEM)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3);
1568 dp[DCTSIZE*3] = (DCTELEM)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3);
1569 dp[DCTSIZE*4] = (DCTELEM)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3);
1570 }
1571 #if FAST_DCTPTRS
1572 #undef d0
1573 #undef d1
1574 #undef d2
1575 #undef d3
1576 #undef d4
1577 #undef d5
1578 #undef d6
1579 #undef d7
1580 #endif
1581 }
1582 #endif /* optimize.asm */
1583
1584 #endif