1/2 resolution decoding
[libav.git] / libavcodec / jrevdct.c
CommitLineData
de6d9b64
FB
1/*
2 * jrevdct.c
3 *
4 * Copyright (C) 1991, 1992, Thomas G. Lane.
5 * This file is part of the Independent JPEG Group's software.
6 * For conditions of distribution and use, see the accompanying README file.
7 *
8 * This file contains the basic inverse-DCT transformation subroutine.
9 *
10 * This implementation is based on an algorithm described in
11 * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
12 * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
13 * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
14 * The primary algorithm described there uses 11 multiplies and 29 adds.
15 * We use their alternate method with 12 multiplies and 32 adds.
16 * The advantage of this method is that no data path contains more than one
17 * multiplication; this allows a very simple and accurate implementation in
18 * scaled fixed-point arithmetic, with a minimal number of shifts.
19 *
20 * I've made lots of modifications to attempt to take advantage of the
21 * sparse nature of the DCT matrices we're getting. Although the logic
22 * is cumbersome, it's straightforward and the resulting code is much
23 * faster.
24 *
25 * A better way to do this would be to pass in the DCT block as a sparse
26 * matrix, perhaps with the difference cases encoded.
27 */
983e3246
MN
28
29/**
30 * @file jrevdct.c
31 * Independent JPEG Group's LLM idct.
32 */
33
de6d9b64
FB
34#include "common.h"
35#include "dsputil.h"
36
37#define EIGHT_BIT_SAMPLES
38
39#define DCTSIZE 8
40#define DCTSIZE2 64
41
42#define GLOBAL
43
44#define RIGHT_SHIFT(x, n) ((x) >> (n))
45
46typedef DCTELEM DCTBLOCK[DCTSIZE2];
47
48#define CONST_BITS 13
49
50/*
51 * This routine is specialized to the case DCTSIZE = 8.
52 */
53
54#if DCTSIZE != 8
55 Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
56#endif
57
58
59/*
60 * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT
61 * on each column. Direct algorithms are also available, but they are
62 * much more complex and seem not to be any faster when reduced to code.
63 *
64 * The poop on this scaling stuff is as follows:
65 *
66 * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
67 * larger than the true IDCT outputs. The final outputs are therefore
68 * a factor of N larger than desired; since N=8 this can be cured by
69 * a simple right shift at the end of the algorithm. The advantage of
70 * this arrangement is that we save two multiplications per 1-D IDCT,
71 * because the y0 and y4 inputs need not be divided by sqrt(N).
72 *
73 * We have to do addition and subtraction of the integer inputs, which
74 * is no problem, and multiplication by fractional constants, which is
75 * a problem to do in integer arithmetic. We multiply all the constants
76 * by CONST_SCALE and convert them to integer constants (thus retaining
77 * CONST_BITS bits of precision in the constants). After doing a
78 * multiplication we have to divide the product by CONST_SCALE, with proper
79 * rounding, to produce the correct output. This division can be done
80 * cheaply as a right shift of CONST_BITS bits. We postpone shifting
81 * as long as possible so that partial sums can be added together with
82 * full fractional precision.
83 *
84 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
85 * they are represented to better-than-integral precision. These outputs
86 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
87 * with the recommended scaling. (To scale up 12-bit sample data further, an
88 * intermediate int32 array would be needed.)
89 *
90 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
91 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
92 * shows that the values given below are the most effective.
93 */
94
95#ifdef EIGHT_BIT_SAMPLES
96#define PASS1_BITS 2
97#else
98#define PASS1_BITS 1 /* lose a little precision to avoid overflow */
99#endif
100
0c1a9eda 101#define ONE ((int32_t) 1)
de6d9b64
FB
102
103#define CONST_SCALE (ONE << CONST_BITS)
104
105/* Convert a positive real constant to an integer scaled by CONST_SCALE.
106 * IMPORTANT: if your compiler doesn't do this arithmetic at compile time,
107 * you will pay a significant penalty in run time. In that case, figure
108 * the correct integer constant values and insert them by hand.
109 */
110
111/* Actually FIX is no longer used, we precomputed them all */
0c1a9eda 112#define FIX(x) ((int32_t) ((x) * CONST_SCALE + 0.5))
de6d9b64 113
0c1a9eda 114/* Descale and correctly round an int32_t value that's scaled by N bits.
de6d9b64
FB
115 * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
116 * the fudge factor is correct for either sign of X.
117 */
118
119#define DESCALE(x,n) RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
120
0c1a9eda 121/* Multiply an int32_t variable by an int32_t constant to yield an int32_t result.
de6d9b64
FB
122 * For 8-bit samples with the recommended scaling, all the variable
123 * and constant values involved are no more than 16 bits wide, so a
124 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply;
125 * this provides a useful speedup on many machines.
126 * There is no way to specify a 16x16->32 multiply in portable C, but
127 * some C compilers will do the right thing if you provide the correct
128 * combination of casts.
129 * NB: for 12-bit samples, a full 32-bit multiplication will be needed.
130 */
131
132#ifdef EIGHT_BIT_SAMPLES
133#ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */
0c1a9eda 134#define MULTIPLY(var,const) (((int16_t) (var)) * ((int16_t) (const)))
de6d9b64
FB
135#endif
136#ifdef SHORTxLCONST_32 /* known to work with Microsoft C 6.0 */
0c1a9eda 137#define MULTIPLY(var,const) (((int16_t) (var)) * ((int32_t) (const)))
de6d9b64
FB
138#endif
139#endif
140
141#ifndef MULTIPLY /* default definition */
142#define MULTIPLY(var,const) ((var) * (const))
143#endif
144
145
146/*
147 Unlike our decoder where we approximate the FIXes, we need to use exact
148ones here or successive P-frames will drift too much with Reference frame coding
149*/
150#define FIX_0_211164243 1730
151#define FIX_0_275899380 2260
152#define FIX_0_298631336 2446
153#define FIX_0_390180644 3196
154#define FIX_0_509795579 4176
155#define FIX_0_541196100 4433
156#define FIX_0_601344887 4926
157#define FIX_0_765366865 6270
158#define FIX_0_785694958 6436
159#define FIX_0_899976223 7373
160#define FIX_1_061594337 8697
161#define FIX_1_111140466 9102
162#define FIX_1_175875602 9633
163#define FIX_1_306562965 10703
164#define FIX_1_387039845 11363
165#define FIX_1_451774981 11893
166#define FIX_1_501321110 12299
167#define FIX_1_662939225 13623
168#define FIX_1_847759065 15137
169#define FIX_1_961570560 16069
170#define FIX_2_053119869 16819
171#define FIX_2_172734803 17799
172#define FIX_2_562915447 20995
173#define FIX_3_072711026 25172
174
175/*
176 * Perform the inverse DCT on one block of coefficients.
177 */
178
179void j_rev_dct(DCTBLOCK data)
180{
0c1a9eda
ZK
181 int32_t tmp0, tmp1, tmp2, tmp3;
182 int32_t tmp10, tmp11, tmp12, tmp13;
183 int32_t z1, z2, z3, z4, z5;
184 int32_t d0, d1, d2, d3, d4, d5, d6, d7;
de6d9b64
FB
185 register DCTELEM *dataptr;
186 int rowctr;
187
188 /* Pass 1: process rows. */
189 /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
190 /* furthermore, we scale the results by 2**PASS1_BITS. */
191
192 dataptr = data;
193
194 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
195 /* Due to quantization, we will usually find that many of the input
196 * coefficients are zero, especially the AC terms. We can exploit this
197 * by short-circuiting the IDCT calculation for any row in which all
198 * the AC terms are zero. In that case each output is equal to the
199 * DC coefficient (with scale factor as needed).
200 * With typical images and quantization tables, half or more of the
201 * row DCT calculations can be simplified this way.
202 */
203
204 register int *idataptr = (int*)dataptr;
205
13b54752
FB
206 /* WARNING: we do the same permutation as MMX idct to simplify the
207 video core */
de6d9b64 208 d0 = dataptr[0];
13b54752
FB
209 d2 = dataptr[1];
210 d4 = dataptr[2];
211 d6 = dataptr[3];
212 d1 = dataptr[4];
213 d3 = dataptr[5];
214 d5 = dataptr[6];
de6d9b64
FB
215 d7 = dataptr[7];
216
13b54752 217 if ((d1 | d2 | d3 | d4 | d5 | d6 | d7) == 0) {
de6d9b64
FB
218 /* AC terms all zero */
219 if (d0) {
220 /* Compute a 32 bit value to assign. */
221 DCTELEM dcval = (DCTELEM) (d0 << PASS1_BITS);
222 register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000);
223
224 idataptr[0] = v;
225 idataptr[1] = v;
226 idataptr[2] = v;
227 idataptr[3] = v;
228 }
229
230 dataptr += DCTSIZE; /* advance pointer to next row */
231 continue;
232 }
233
234 /* Even part: reverse the even part of the forward DCT. */
235 /* The rotator is sqrt(2)*c(-6). */
236{
237 if (d6) {
238 if (d4) {
239 if (d2) {
240 if (d0) {
241 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
242 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
243 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
244 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
245
246 tmp0 = (d0 + d4) << CONST_BITS;
247 tmp1 = (d0 - d4) << CONST_BITS;
248
249 tmp10 = tmp0 + tmp3;
250 tmp13 = tmp0 - tmp3;
251 tmp11 = tmp1 + tmp2;
252 tmp12 = tmp1 - tmp2;
253 } else {
254 /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
255 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
256 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
257 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
258
259 tmp0 = d4 << CONST_BITS;
260
261 tmp10 = tmp0 + tmp3;
262 tmp13 = tmp0 - tmp3;
263 tmp11 = tmp2 - tmp0;
264 tmp12 = -(tmp0 + tmp2);
265 }
266 } else {
267 if (d0) {
268 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
269 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
270 tmp3 = MULTIPLY(d6, FIX_0_541196100);
271
272 tmp0 = (d0 + d4) << CONST_BITS;
273 tmp1 = (d0 - d4) << CONST_BITS;
274
275 tmp10 = tmp0 + tmp3;
276 tmp13 = tmp0 - tmp3;
277 tmp11 = tmp1 + tmp2;
278 tmp12 = tmp1 - tmp2;
279 } else {
280 /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
281 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
282 tmp3 = MULTIPLY(d6, FIX_0_541196100);
283
284 tmp0 = d4 << CONST_BITS;
285
286 tmp10 = tmp0 + tmp3;
287 tmp13 = tmp0 - tmp3;
288 tmp11 = tmp2 - tmp0;
289 tmp12 = -(tmp0 + tmp2);
290 }
291 }
292 } else {
293 if (d2) {
294 if (d0) {
295 /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
296 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
297 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
298 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
299
300 tmp0 = d0 << CONST_BITS;
301
302 tmp10 = tmp0 + tmp3;
303 tmp13 = tmp0 - tmp3;
304 tmp11 = tmp0 + tmp2;
305 tmp12 = tmp0 - tmp2;
306 } else {
307 /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
308 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
309 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
310 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
311
312 tmp10 = tmp3;
313 tmp13 = -tmp3;
314 tmp11 = tmp2;
315 tmp12 = -tmp2;
316 }
317 } else {
318 if (d0) {
319 /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
320 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
321 tmp3 = MULTIPLY(d6, FIX_0_541196100);
322
323 tmp0 = d0 << CONST_BITS;
324
325 tmp10 = tmp0 + tmp3;
326 tmp13 = tmp0 - tmp3;
327 tmp11 = tmp0 + tmp2;
328 tmp12 = tmp0 - tmp2;
329 } else {
330 /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
331 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
332 tmp3 = MULTIPLY(d6, FIX_0_541196100);
333
334 tmp10 = tmp3;
335 tmp13 = -tmp3;
336 tmp11 = tmp2;
337 tmp12 = -tmp2;
338 }
339 }
340 }
341 } else {
342 if (d4) {
343 if (d2) {
344 if (d0) {
345 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
346 tmp2 = MULTIPLY(d2, FIX_0_541196100);
347 tmp3 = MULTIPLY(d2, FIX_1_306562965);
348
349 tmp0 = (d0 + d4) << CONST_BITS;
350 tmp1 = (d0 - d4) << CONST_BITS;
351
352 tmp10 = tmp0 + tmp3;
353 tmp13 = tmp0 - tmp3;
354 tmp11 = tmp1 + tmp2;
355 tmp12 = tmp1 - tmp2;
356 } else {
357 /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
358 tmp2 = MULTIPLY(d2, FIX_0_541196100);
359 tmp3 = MULTIPLY(d2, FIX_1_306562965);
360
361 tmp0 = d4 << CONST_BITS;
362
363 tmp10 = tmp0 + tmp3;
364 tmp13 = tmp0 - tmp3;
365 tmp11 = tmp2 - tmp0;
366 tmp12 = -(tmp0 + tmp2);
367 }
368 } else {
369 if (d0) {
370 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
371 tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
372 tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
373 } else {
374 /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
375 tmp10 = tmp13 = d4 << CONST_BITS;
376 tmp11 = tmp12 = -tmp10;
377 }
378 }
379 } else {
380 if (d2) {
381 if (d0) {
382 /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
383 tmp2 = MULTIPLY(d2, FIX_0_541196100);
384 tmp3 = MULTIPLY(d2, FIX_1_306562965);
385
386 tmp0 = d0 << CONST_BITS;
387
388 tmp10 = tmp0 + tmp3;
389 tmp13 = tmp0 - tmp3;
390 tmp11 = tmp0 + tmp2;
391 tmp12 = tmp0 - tmp2;
392 } else {
393 /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
394 tmp2 = MULTIPLY(d2, FIX_0_541196100);
395 tmp3 = MULTIPLY(d2, FIX_1_306562965);
396
397 tmp10 = tmp3;
398 tmp13 = -tmp3;
399 tmp11 = tmp2;
400 tmp12 = -tmp2;
401 }
402 } else {
403 if (d0) {
404 /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
405 tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
406 } else {
407 /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
408 tmp10 = tmp13 = tmp11 = tmp12 = 0;
409 }
410 }
411 }
412 }
413
414 /* Odd part per figure 8; the matrix is unitary and hence its
415 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
416 */
417
418 if (d7) {
419 if (d5) {
420 if (d3) {
421 if (d1) {
422 /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
423 z1 = d7 + d1;
424 z2 = d5 + d3;
425 z3 = d7 + d3;
426 z4 = d5 + d1;
427 z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
428
429 tmp0 = MULTIPLY(d7, FIX_0_298631336);
430 tmp1 = MULTIPLY(d5, FIX_2_053119869);
431 tmp2 = MULTIPLY(d3, FIX_3_072711026);
432 tmp3 = MULTIPLY(d1, FIX_1_501321110);
433 z1 = MULTIPLY(-z1, FIX_0_899976223);
434 z2 = MULTIPLY(-z2, FIX_2_562915447);
435 z3 = MULTIPLY(-z3, FIX_1_961570560);
436 z4 = MULTIPLY(-z4, FIX_0_390180644);
437
438 z3 += z5;
439 z4 += z5;
440
441 tmp0 += z1 + z3;
442 tmp1 += z2 + z4;
443 tmp2 += z2 + z3;
444 tmp3 += z1 + z4;
445 } else {
446 /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
447 z2 = d5 + d3;
448 z3 = d7 + d3;
449 z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
450
451 tmp0 = MULTIPLY(d7, FIX_0_298631336);
452 tmp1 = MULTIPLY(d5, FIX_2_053119869);
453 tmp2 = MULTIPLY(d3, FIX_3_072711026);
454 z1 = MULTIPLY(-d7, FIX_0_899976223);
455 z2 = MULTIPLY(-z2, FIX_2_562915447);
456 z3 = MULTIPLY(-z3, FIX_1_961570560);
457 z4 = MULTIPLY(-d5, FIX_0_390180644);
458
459 z3 += z5;
460 z4 += z5;
461
462 tmp0 += z1 + z3;
463 tmp1 += z2 + z4;
464 tmp2 += z2 + z3;
465 tmp3 = z1 + z4;
466 }
467 } else {
468 if (d1) {
469 /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
470 z1 = d7 + d1;
471 z4 = d5 + d1;
472 z5 = MULTIPLY(d7 + z4, FIX_1_175875602);
473
474 tmp0 = MULTIPLY(d7, FIX_0_298631336);
475 tmp1 = MULTIPLY(d5, FIX_2_053119869);
476 tmp3 = MULTIPLY(d1, FIX_1_501321110);
477 z1 = MULTIPLY(-z1, FIX_0_899976223);
478 z2 = MULTIPLY(-d5, FIX_2_562915447);
479 z3 = MULTIPLY(-d7, FIX_1_961570560);
480 z4 = MULTIPLY(-z4, FIX_0_390180644);
481
482 z3 += z5;
483 z4 += z5;
484
485 tmp0 += z1 + z3;
486 tmp1 += z2 + z4;
487 tmp2 = z2 + z3;
488 tmp3 += z1 + z4;
489 } else {
490 /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
491 tmp0 = MULTIPLY(-d7, FIX_0_601344887);
492 z1 = MULTIPLY(-d7, FIX_0_899976223);
493 z3 = MULTIPLY(-d7, FIX_1_961570560);
494 tmp1 = MULTIPLY(-d5, FIX_0_509795579);
495 z2 = MULTIPLY(-d5, FIX_2_562915447);
496 z4 = MULTIPLY(-d5, FIX_0_390180644);
497 z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
498
499 z3 += z5;
500 z4 += z5;
501
502 tmp0 += z3;
503 tmp1 += z4;
504 tmp2 = z2 + z3;
505 tmp3 = z1 + z4;
506 }
507 }
508 } else {
509 if (d3) {
510 if (d1) {
511 /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
512 z1 = d7 + d1;
513 z3 = d7 + d3;
514 z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
515
516 tmp0 = MULTIPLY(d7, FIX_0_298631336);
517 tmp2 = MULTIPLY(d3, FIX_3_072711026);
518 tmp3 = MULTIPLY(d1, FIX_1_501321110);
519 z1 = MULTIPLY(-z1, FIX_0_899976223);
520 z2 = MULTIPLY(-d3, FIX_2_562915447);
521 z3 = MULTIPLY(-z3, FIX_1_961570560);
522 z4 = MULTIPLY(-d1, FIX_0_390180644);
523
524 z3 += z5;
525 z4 += z5;
526
527 tmp0 += z1 + z3;
528 tmp1 = z2 + z4;
529 tmp2 += z2 + z3;
530 tmp3 += z1 + z4;
531 } else {
532 /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
533 z3 = d7 + d3;
534
535 tmp0 = MULTIPLY(-d7, FIX_0_601344887);
536 z1 = MULTIPLY(-d7, FIX_0_899976223);
537 tmp2 = MULTIPLY(d3, FIX_0_509795579);
538 z2 = MULTIPLY(-d3, FIX_2_562915447);
539 z5 = MULTIPLY(z3, FIX_1_175875602);
540 z3 = MULTIPLY(-z3, FIX_0_785694958);
541
542 tmp0 += z3;
543 tmp1 = z2 + z5;
544 tmp2 += z3;
545 tmp3 = z1 + z5;
546 }
547 } else {
548 if (d1) {
549 /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
550 z1 = d7 + d1;
551 z5 = MULTIPLY(z1, FIX_1_175875602);
552
553 z1 = MULTIPLY(z1, FIX_0_275899380);
554 z3 = MULTIPLY(-d7, FIX_1_961570560);
555 tmp0 = MULTIPLY(-d7, FIX_1_662939225);
556 z4 = MULTIPLY(-d1, FIX_0_390180644);
557 tmp3 = MULTIPLY(d1, FIX_1_111140466);
558
559 tmp0 += z1;
560 tmp1 = z4 + z5;
561 tmp2 = z3 + z5;
562 tmp3 += z1;
563 } else {
564 /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
565 tmp0 = MULTIPLY(-d7, FIX_1_387039845);
566 tmp1 = MULTIPLY(d7, FIX_1_175875602);
567 tmp2 = MULTIPLY(-d7, FIX_0_785694958);
568 tmp3 = MULTIPLY(d7, FIX_0_275899380);
569 }
570 }
571 }
572 } else {
573 if (d5) {
574 if (d3) {
575 if (d1) {
576 /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
577 z2 = d5 + d3;
578 z4 = d5 + d1;
579 z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
580
581 tmp1 = MULTIPLY(d5, FIX_2_053119869);
582 tmp2 = MULTIPLY(d3, FIX_3_072711026);
583 tmp3 = MULTIPLY(d1, FIX_1_501321110);
584 z1 = MULTIPLY(-d1, FIX_0_899976223);
585 z2 = MULTIPLY(-z2, FIX_2_562915447);
586 z3 = MULTIPLY(-d3, FIX_1_961570560);
587 z4 = MULTIPLY(-z4, FIX_0_390180644);
588
589 z3 += z5;
590 z4 += z5;
591
592 tmp0 = z1 + z3;
593 tmp1 += z2 + z4;
594 tmp2 += z2 + z3;
595 tmp3 += z1 + z4;
596 } else {
597 /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
598 z2 = d5 + d3;
599
600 z5 = MULTIPLY(z2, FIX_1_175875602);
601 tmp1 = MULTIPLY(d5, FIX_1_662939225);
602 z4 = MULTIPLY(-d5, FIX_0_390180644);
603 z2 = MULTIPLY(-z2, FIX_1_387039845);
604 tmp2 = MULTIPLY(d3, FIX_1_111140466);
605 z3 = MULTIPLY(-d3, FIX_1_961570560);
606
607 tmp0 = z3 + z5;
608 tmp1 += z2;
609 tmp2 += z2;
610 tmp3 = z4 + z5;
611 }
612 } else {
613 if (d1) {
614 /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
615 z4 = d5 + d1;
616
617 z5 = MULTIPLY(z4, FIX_1_175875602);
618 z1 = MULTIPLY(-d1, FIX_0_899976223);
619 tmp3 = MULTIPLY(d1, FIX_0_601344887);
620 tmp1 = MULTIPLY(-d5, FIX_0_509795579);
621 z2 = MULTIPLY(-d5, FIX_2_562915447);
622 z4 = MULTIPLY(z4, FIX_0_785694958);
623
624 tmp0 = z1 + z5;
625 tmp1 += z4;
626 tmp2 = z2 + z5;
627 tmp3 += z4;
628 } else {
629 /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
630 tmp0 = MULTIPLY(d5, FIX_1_175875602);
631 tmp1 = MULTIPLY(d5, FIX_0_275899380);
632 tmp2 = MULTIPLY(-d5, FIX_1_387039845);
633 tmp3 = MULTIPLY(d5, FIX_0_785694958);
634 }
635 }
636 } else {
637 if (d3) {
638 if (d1) {
639 /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
640 z5 = d1 + d3;
641 tmp3 = MULTIPLY(d1, FIX_0_211164243);
642 tmp2 = MULTIPLY(-d3, FIX_1_451774981);
643 z1 = MULTIPLY(d1, FIX_1_061594337);
644 z2 = MULTIPLY(-d3, FIX_2_172734803);
645 z4 = MULTIPLY(z5, FIX_0_785694958);
646 z5 = MULTIPLY(z5, FIX_1_175875602);
647
648 tmp0 = z1 - z4;
649 tmp1 = z2 + z4;
650 tmp2 += z5;
651 tmp3 += z5;
652 } else {
653 /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
654 tmp0 = MULTIPLY(-d3, FIX_0_785694958);
655 tmp1 = MULTIPLY(-d3, FIX_1_387039845);
656 tmp2 = MULTIPLY(-d3, FIX_0_275899380);
657 tmp3 = MULTIPLY(d3, FIX_1_175875602);
658 }
659 } else {
660 if (d1) {
661 /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
662 tmp0 = MULTIPLY(d1, FIX_0_275899380);
663 tmp1 = MULTIPLY(d1, FIX_0_785694958);
664 tmp2 = MULTIPLY(d1, FIX_1_175875602);
665 tmp3 = MULTIPLY(d1, FIX_1_387039845);
666 } else {
667 /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
668 tmp0 = tmp1 = tmp2 = tmp3 = 0;
669 }
670 }
671 }
672 }
673}
674 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
675
676 dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
677 dataptr[7] = (DCTELEM) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
678 dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
679 dataptr[6] = (DCTELEM) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
680 dataptr[2] = (DCTELEM) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
681 dataptr[5] = (DCTELEM) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
682 dataptr[3] = (DCTELEM) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
683 dataptr[4] = (DCTELEM) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
684
685 dataptr += DCTSIZE; /* advance pointer to next row */
686 }
687
688 /* Pass 2: process columns. */
689 /* Note that we must descale the results by a factor of 8 == 2**3, */
690 /* and also undo the PASS1_BITS scaling. */
691
692 dataptr = data;
693 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
694 /* Columns of zeroes can be exploited in the same way as we did with rows.
695 * However, the row calculation has created many nonzero AC terms, so the
696 * simplification applies less often (typically 5% to 10% of the time).
697 * On machines with very fast multiplication, it's possible that the
698 * test takes more time than it's worth. In that case this section
699 * may be commented out.
700 */
701
702 d0 = dataptr[DCTSIZE*0];
703 d1 = dataptr[DCTSIZE*1];
704 d2 = dataptr[DCTSIZE*2];
705 d3 = dataptr[DCTSIZE*3];
706 d4 = dataptr[DCTSIZE*4];
707 d5 = dataptr[DCTSIZE*5];
708 d6 = dataptr[DCTSIZE*6];
709 d7 = dataptr[DCTSIZE*7];
710
711 /* Even part: reverse the even part of the forward DCT. */
712 /* The rotator is sqrt(2)*c(-6). */
713 if (d6) {
714 if (d4) {
715 if (d2) {
716 if (d0) {
717 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
718 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
719 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
720 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
721
722 tmp0 = (d0 + d4) << CONST_BITS;
723 tmp1 = (d0 - d4) << CONST_BITS;
724
725 tmp10 = tmp0 + tmp3;
726 tmp13 = tmp0 - tmp3;
727 tmp11 = tmp1 + tmp2;
728 tmp12 = tmp1 - tmp2;
729 } else {
730 /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
731 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
732 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
733 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
734
735 tmp0 = d4 << CONST_BITS;
736
737 tmp10 = tmp0 + tmp3;
738 tmp13 = tmp0 - tmp3;
739 tmp11 = tmp2 - tmp0;
740 tmp12 = -(tmp0 + tmp2);
741 }
742 } else {
743 if (d0) {
744 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
745 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
746 tmp3 = MULTIPLY(d6, FIX_0_541196100);
747
748 tmp0 = (d0 + d4) << CONST_BITS;
749 tmp1 = (d0 - d4) << CONST_BITS;
750
751 tmp10 = tmp0 + tmp3;
752 tmp13 = tmp0 - tmp3;
753 tmp11 = tmp1 + tmp2;
754 tmp12 = tmp1 - tmp2;
755 } else {
756 /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
757 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
758 tmp3 = MULTIPLY(d6, FIX_0_541196100);
759
760 tmp0 = d4 << CONST_BITS;
761
762 tmp10 = tmp0 + tmp3;
763 tmp13 = tmp0 - tmp3;
764 tmp11 = tmp2 - tmp0;
765 tmp12 = -(tmp0 + tmp2);
766 }
767 }
768 } else {
769 if (d2) {
770 if (d0) {
771 /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
772 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
773 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
774 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
775
776 tmp0 = d0 << CONST_BITS;
777
778 tmp10 = tmp0 + tmp3;
779 tmp13 = tmp0 - tmp3;
780 tmp11 = tmp0 + tmp2;
781 tmp12 = tmp0 - tmp2;
782 } else {
783 /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
784 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
785 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
786 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
787
788 tmp10 = tmp3;
789 tmp13 = -tmp3;
790 tmp11 = tmp2;
791 tmp12 = -tmp2;
792 }
793 } else {
794 if (d0) {
795 /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
796 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
797 tmp3 = MULTIPLY(d6, FIX_0_541196100);
798
799 tmp0 = d0 << CONST_BITS;
800
801 tmp10 = tmp0 + tmp3;
802 tmp13 = tmp0 - tmp3;
803 tmp11 = tmp0 + tmp2;
804 tmp12 = tmp0 - tmp2;
805 } else {
806 /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
807 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
808 tmp3 = MULTIPLY(d6, FIX_0_541196100);
809
810 tmp10 = tmp3;
811 tmp13 = -tmp3;
812 tmp11 = tmp2;
813 tmp12 = -tmp2;
814 }
815 }
816 }
817 } else {
818 if (d4) {
819 if (d2) {
820 if (d0) {
821 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
822 tmp2 = MULTIPLY(d2, FIX_0_541196100);
823 tmp3 = MULTIPLY(d2, FIX_1_306562965);
824
825 tmp0 = (d0 + d4) << CONST_BITS;
826 tmp1 = (d0 - d4) << CONST_BITS;
827
828 tmp10 = tmp0 + tmp3;
829 tmp13 = tmp0 - tmp3;
830 tmp11 = tmp1 + tmp2;
831 tmp12 = tmp1 - tmp2;
832 } else {
833 /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
834 tmp2 = MULTIPLY(d2, FIX_0_541196100);
835 tmp3 = MULTIPLY(d2, FIX_1_306562965);
836
837 tmp0 = d4 << CONST_BITS;
838
839 tmp10 = tmp0 + tmp3;
840 tmp13 = tmp0 - tmp3;
841 tmp11 = tmp2 - tmp0;
842 tmp12 = -(tmp0 + tmp2);
843 }
844 } else {
845 if (d0) {
846 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
847 tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
848 tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
849 } else {
850 /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
851 tmp10 = tmp13 = d4 << CONST_BITS;
852 tmp11 = tmp12 = -tmp10;
853 }
854 }
855 } else {
856 if (d2) {
857 if (d0) {
858 /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
859 tmp2 = MULTIPLY(d2, FIX_0_541196100);
860 tmp3 = MULTIPLY(d2, FIX_1_306562965);
861
862 tmp0 = d0 << CONST_BITS;
863
864 tmp10 = tmp0 + tmp3;
865 tmp13 = tmp0 - tmp3;
866 tmp11 = tmp0 + tmp2;
867 tmp12 = tmp0 - tmp2;
868 } else {
869 /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
870 tmp2 = MULTIPLY(d2, FIX_0_541196100);
871 tmp3 = MULTIPLY(d2, FIX_1_306562965);
872
873 tmp10 = tmp3;
874 tmp13 = -tmp3;
875 tmp11 = tmp2;
876 tmp12 = -tmp2;
877 }
878 } else {
879 if (d0) {
880 /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
881 tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
882 } else {
883 /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
884 tmp10 = tmp13 = tmp11 = tmp12 = 0;
885 }
886 }
887 }
888 }
889
890 /* Odd part per figure 8; the matrix is unitary and hence its
891 * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
892 */
893 if (d7) {
894 if (d5) {
895 if (d3) {
896 if (d1) {
897 /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
898 z1 = d7 + d1;
899 z2 = d5 + d3;
900 z3 = d7 + d3;
901 z4 = d5 + d1;
902 z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
903
904 tmp0 = MULTIPLY(d7, FIX_0_298631336);
905 tmp1 = MULTIPLY(d5, FIX_2_053119869);
906 tmp2 = MULTIPLY(d3, FIX_3_072711026);
907 tmp3 = MULTIPLY(d1, FIX_1_501321110);
908 z1 = MULTIPLY(-z1, FIX_0_899976223);
909 z2 = MULTIPLY(-z2, FIX_2_562915447);
910 z3 = MULTIPLY(-z3, FIX_1_961570560);
911 z4 = MULTIPLY(-z4, FIX_0_390180644);
912
913 z3 += z5;
914 z4 += z5;
915
916 tmp0 += z1 + z3;
917 tmp1 += z2 + z4;
918 tmp2 += z2 + z3;
919 tmp3 += z1 + z4;
920 } else {
921 /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
922 z1 = d7;
923 z2 = d5 + d3;
924 z3 = d7 + d3;
925 z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
926
927 tmp0 = MULTIPLY(d7, FIX_0_298631336);
928 tmp1 = MULTIPLY(d5, FIX_2_053119869);
929 tmp2 = MULTIPLY(d3, FIX_3_072711026);
930 z1 = MULTIPLY(-d7, FIX_0_899976223);
931 z2 = MULTIPLY(-z2, FIX_2_562915447);
932 z3 = MULTIPLY(-z3, FIX_1_961570560);
933 z4 = MULTIPLY(-d5, FIX_0_390180644);
934
935 z3 += z5;
936 z4 += z5;
937
938 tmp0 += z1 + z3;
939 tmp1 += z2 + z4;
940 tmp2 += z2 + z3;
941 tmp3 = z1 + z4;
942 }
943 } else {
944 if (d1) {
945 /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
946 z1 = d7 + d1;
947 z2 = d5;
948 z3 = d7;
949 z4 = d5 + d1;
950 z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
951
952 tmp0 = MULTIPLY(d7, FIX_0_298631336);
953 tmp1 = MULTIPLY(d5, FIX_2_053119869);
954 tmp3 = MULTIPLY(d1, FIX_1_501321110);
955 z1 = MULTIPLY(-z1, FIX_0_899976223);
956 z2 = MULTIPLY(-d5, FIX_2_562915447);
957 z3 = MULTIPLY(-d7, FIX_1_961570560);
958 z4 = MULTIPLY(-z4, FIX_0_390180644);
959
960 z3 += z5;
961 z4 += z5;
962
963 tmp0 += z1 + z3;
964 tmp1 += z2 + z4;
965 tmp2 = z2 + z3;
966 tmp3 += z1 + z4;
967 } else {
968 /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
969 tmp0 = MULTIPLY(-d7, FIX_0_601344887);
970 z1 = MULTIPLY(-d7, FIX_0_899976223);
971 z3 = MULTIPLY(-d7, FIX_1_961570560);
972 tmp1 = MULTIPLY(-d5, FIX_0_509795579);
973 z2 = MULTIPLY(-d5, FIX_2_562915447);
974 z4 = MULTIPLY(-d5, FIX_0_390180644);
975 z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
976
977 z3 += z5;
978 z4 += z5;
979
980 tmp0 += z3;
981 tmp1 += z4;
982 tmp2 = z2 + z3;
983 tmp3 = z1 + z4;
984 }
985 }
986 } else {
987 if (d3) {
988 if (d1) {
989 /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
990 z1 = d7 + d1;
991 z3 = d7 + d3;
992 z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
993
994 tmp0 = MULTIPLY(d7, FIX_0_298631336);
995 tmp2 = MULTIPLY(d3, FIX_3_072711026);
996 tmp3 = MULTIPLY(d1, FIX_1_501321110);
997 z1 = MULTIPLY(-z1, FIX_0_899976223);
998 z2 = MULTIPLY(-d3, FIX_2_562915447);
999 z3 = MULTIPLY(-z3, FIX_1_961570560);
1000 z4 = MULTIPLY(-d1, FIX_0_390180644);
1001
1002 z3 += z5;
1003 z4 += z5;
1004
1005 tmp0 += z1 + z3;
1006 tmp1 = z2 + z4;
1007 tmp2 += z2 + z3;
1008 tmp3 += z1 + z4;
1009 } else {
1010 /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
1011 z3 = d7 + d3;
1012
1013 tmp0 = MULTIPLY(-d7, FIX_0_601344887);
1014 z1 = MULTIPLY(-d7, FIX_0_899976223);
1015 tmp2 = MULTIPLY(d3, FIX_0_509795579);
1016 z2 = MULTIPLY(-d3, FIX_2_562915447);
1017 z5 = MULTIPLY(z3, FIX_1_175875602);
1018 z3 = MULTIPLY(-z3, FIX_0_785694958);
1019
1020 tmp0 += z3;
1021 tmp1 = z2 + z5;
1022 tmp2 += z3;
1023 tmp3 = z1 + z5;
1024 }
1025 } else {
1026 if (d1) {
1027 /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
1028 z1 = d7 + d1;
1029 z5 = MULTIPLY(z1, FIX_1_175875602);
1030
1031 z1 = MULTIPLY(z1, FIX_0_275899380);
1032 z3 = MULTIPLY(-d7, FIX_1_961570560);
1033 tmp0 = MULTIPLY(-d7, FIX_1_662939225);
1034 z4 = MULTIPLY(-d1, FIX_0_390180644);
1035 tmp3 = MULTIPLY(d1, FIX_1_111140466);
1036
1037 tmp0 += z1;
1038 tmp1 = z4 + z5;
1039 tmp2 = z3 + z5;
1040 tmp3 += z1;
1041 } else {
1042 /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
1043 tmp0 = MULTIPLY(-d7, FIX_1_387039845);
1044 tmp1 = MULTIPLY(d7, FIX_1_175875602);
1045 tmp2 = MULTIPLY(-d7, FIX_0_785694958);
1046 tmp3 = MULTIPLY(d7, FIX_0_275899380);
1047 }
1048 }
1049 }
1050 } else {
1051 if (d5) {
1052 if (d3) {
1053 if (d1) {
1054 /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
1055 z2 = d5 + d3;
1056 z4 = d5 + d1;
1057 z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
1058
1059 tmp1 = MULTIPLY(d5, FIX_2_053119869);
1060 tmp2 = MULTIPLY(d3, FIX_3_072711026);
1061 tmp3 = MULTIPLY(d1, FIX_1_501321110);
1062 z1 = MULTIPLY(-d1, FIX_0_899976223);
1063 z2 = MULTIPLY(-z2, FIX_2_562915447);
1064 z3 = MULTIPLY(-d3, FIX_1_961570560);
1065 z4 = MULTIPLY(-z4, FIX_0_390180644);
1066
1067 z3 += z5;
1068 z4 += z5;
1069
1070 tmp0 = z1 + z3;
1071 tmp1 += z2 + z4;
1072 tmp2 += z2 + z3;
1073 tmp3 += z1 + z4;
1074 } else {
1075 /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
1076 z2 = d5 + d3;
1077
1078 z5 = MULTIPLY(z2, FIX_1_175875602);
1079 tmp1 = MULTIPLY(d5, FIX_1_662939225);
1080 z4 = MULTIPLY(-d5, FIX_0_390180644);
1081 z2 = MULTIPLY(-z2, FIX_1_387039845);
1082 tmp2 = MULTIPLY(d3, FIX_1_111140466);
1083 z3 = MULTIPLY(-d3, FIX_1_961570560);
1084
1085 tmp0 = z3 + z5;
1086 tmp1 += z2;
1087 tmp2 += z2;
1088 tmp3 = z4 + z5;
1089 }
1090 } else {
1091 if (d1) {
1092 /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
1093 z4 = d5 + d1;
1094
1095 z5 = MULTIPLY(z4, FIX_1_175875602);
1096 z1 = MULTIPLY(-d1, FIX_0_899976223);
1097 tmp3 = MULTIPLY(d1, FIX_0_601344887);
1098 tmp1 = MULTIPLY(-d5, FIX_0_509795579);
1099 z2 = MULTIPLY(-d5, FIX_2_562915447);
1100 z4 = MULTIPLY(z4, FIX_0_785694958);
1101
1102 tmp0 = z1 + z5;
1103 tmp1 += z4;
1104 tmp2 = z2 + z5;
1105 tmp3 += z4;
1106 } else {
1107 /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
1108 tmp0 = MULTIPLY(d5, FIX_1_175875602);
1109 tmp1 = MULTIPLY(d5, FIX_0_275899380);
1110 tmp2 = MULTIPLY(-d5, FIX_1_387039845);
1111 tmp3 = MULTIPLY(d5, FIX_0_785694958);
1112 }
1113 }
1114 } else {
1115 if (d3) {
1116 if (d1) {
1117 /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
1118 z5 = d1 + d3;
1119 tmp3 = MULTIPLY(d1, FIX_0_211164243);
1120 tmp2 = MULTIPLY(-d3, FIX_1_451774981);
1121 z1 = MULTIPLY(d1, FIX_1_061594337);
1122 z2 = MULTIPLY(-d3, FIX_2_172734803);
1123 z4 = MULTIPLY(z5, FIX_0_785694958);
1124 z5 = MULTIPLY(z5, FIX_1_175875602);
1125
1126 tmp0 = z1 - z4;
1127 tmp1 = z2 + z4;
1128 tmp2 += z5;
1129 tmp3 += z5;
1130 } else {
1131 /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
1132 tmp0 = MULTIPLY(-d3, FIX_0_785694958);
1133 tmp1 = MULTIPLY(-d3, FIX_1_387039845);
1134 tmp2 = MULTIPLY(-d3, FIX_0_275899380);
1135 tmp3 = MULTIPLY(d3, FIX_1_175875602);
1136 }
1137 } else {
1138 if (d1) {
1139 /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
1140 tmp0 = MULTIPLY(d1, FIX_0_275899380);
1141 tmp1 = MULTIPLY(d1, FIX_0_785694958);
1142 tmp2 = MULTIPLY(d1, FIX_1_175875602);
1143 tmp3 = MULTIPLY(d1, FIX_1_387039845);
1144 } else {
1145 /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
1146 tmp0 = tmp1 = tmp2 = tmp3 = 0;
1147 }
1148 }
1149 }
1150 }
1151
1152 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1153
1154 dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp3,
1155 CONST_BITS+PASS1_BITS+3);
1156 dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp10 - tmp3,
1157 CONST_BITS+PASS1_BITS+3);
1158 dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp11 + tmp2,
1159 CONST_BITS+PASS1_BITS+3);
1160 dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(tmp11 - tmp2,
1161 CONST_BITS+PASS1_BITS+3);
1162 dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp12 + tmp1,
1163 CONST_BITS+PASS1_BITS+3);
1164 dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12 - tmp1,
1165 CONST_BITS+PASS1_BITS+3);
1166 dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp13 + tmp0,
1167 CONST_BITS+PASS1_BITS+3);
1168 dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp13 - tmp0,
1169 CONST_BITS+PASS1_BITS+3);
1170
1171 dataptr++; /* advance pointer to next column */
1172 }
1173}
1174
178fcca8
MN
1175#undef DCTSIZE
1176#define DCTSIZE 4
1177#define DCTSTRIDE 8
1178
1179void j_rev_dct4(DCTBLOCK data)
1180{
1181 int32_t tmp0, tmp1, tmp2, tmp3;
1182 int32_t tmp10, tmp11, tmp12, tmp13;
1183 int32_t z1;
1184 int32_t d0, d2, d4, d6;
1185 register DCTELEM *dataptr;
1186 int rowctr;
1187
1188 /* Pass 1: process rows. */
1189 /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
1190 /* furthermore, we scale the results by 2**PASS1_BITS. */
1191
1192 dataptr = data;
1193
1194 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
1195 /* Due to quantization, we will usually find that many of the input
1196 * coefficients are zero, especially the AC terms. We can exploit this
1197 * by short-circuiting the IDCT calculation for any row in which all
1198 * the AC terms are zero. In that case each output is equal to the
1199 * DC coefficient (with scale factor as needed).
1200 * With typical images and quantization tables, half or more of the
1201 * row DCT calculations can be simplified this way.
1202 */
1203
1204 register int *idataptr = (int*)dataptr;
1205
1206 /* WARNING: we do the same permutation as MMX idct to simplify the
1207 video core */
1208 d0 = dataptr[0];
1209 d2 = dataptr[1];
1210 d4 = dataptr[2];
1211 d6 = dataptr[3];
1212
1213 if ((d2 | d4 | d6) == 0) {
1214 /* AC terms all zero */
1215 if (d0) {
1216 /* Compute a 32 bit value to assign. */
1217 DCTELEM dcval = (DCTELEM) (d0 << PASS1_BITS);
1218 register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000);
1219
1220 idataptr[0] = v;
1221 idataptr[1] = v;
1222 }
1223
1224 dataptr += DCTSTRIDE; /* advance pointer to next row */
1225 continue;
1226 }
1227
1228 /* Even part: reverse the even part of the forward DCT. */
1229 /* The rotator is sqrt(2)*c(-6). */
1230 if (d6) {
1231 if (d4) {
1232 if (d2) {
1233 if (d0) {
1234 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
1235 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
1236 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
1237 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
1238
1239 tmp0 = (d0 + d4) << CONST_BITS;
1240 tmp1 = (d0 - d4) << CONST_BITS;
1241
1242 tmp10 = tmp0 + tmp3;
1243 tmp13 = tmp0 - tmp3;
1244 tmp11 = tmp1 + tmp2;
1245 tmp12 = tmp1 - tmp2;
1246 } else {
1247 /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
1248 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
1249 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
1250 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
1251
1252 tmp0 = d4 << CONST_BITS;
1253
1254 tmp10 = tmp0 + tmp3;
1255 tmp13 = tmp0 - tmp3;
1256 tmp11 = tmp2 - tmp0;
1257 tmp12 = -(tmp0 + tmp2);
1258 }
1259 } else {
1260 if (d0) {
1261 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1262 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
1263 tmp3 = MULTIPLY(d6, FIX_0_541196100);
1264
1265 tmp0 = (d0 + d4) << CONST_BITS;
1266 tmp1 = (d0 - d4) << CONST_BITS;
1267
1268 tmp10 = tmp0 + tmp3;
1269 tmp13 = tmp0 - tmp3;
1270 tmp11 = tmp1 + tmp2;
1271 tmp12 = tmp1 - tmp2;
1272 } else {
1273 /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
1274 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
1275 tmp3 = MULTIPLY(d6, FIX_0_541196100);
1276
1277 tmp0 = d4 << CONST_BITS;
1278
1279 tmp10 = tmp0 + tmp3;
1280 tmp13 = tmp0 - tmp3;
1281 tmp11 = tmp2 - tmp0;
1282 tmp12 = -(tmp0 + tmp2);
1283 }
1284 }
1285 } else {
1286 if (d2) {
1287 if (d0) {
1288 /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
1289 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
1290 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
1291 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
1292
1293 tmp0 = d0 << CONST_BITS;
1294
1295 tmp10 = tmp0 + tmp3;
1296 tmp13 = tmp0 - tmp3;
1297 tmp11 = tmp0 + tmp2;
1298 tmp12 = tmp0 - tmp2;
1299 } else {
1300 /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
1301 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
1302 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
1303 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
1304
1305 tmp10 = tmp3;
1306 tmp13 = -tmp3;
1307 tmp11 = tmp2;
1308 tmp12 = -tmp2;
1309 }
1310 } else {
1311 if (d0) {
1312 /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
1313 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
1314 tmp3 = MULTIPLY(d6, FIX_0_541196100);
1315
1316 tmp0 = d0 << CONST_BITS;
1317
1318 tmp10 = tmp0 + tmp3;
1319 tmp13 = tmp0 - tmp3;
1320 tmp11 = tmp0 + tmp2;
1321 tmp12 = tmp0 - tmp2;
1322 } else {
1323 /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
1324 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
1325 tmp3 = MULTIPLY(d6, FIX_0_541196100);
1326
1327 tmp10 = tmp3;
1328 tmp13 = -tmp3;
1329 tmp11 = tmp2;
1330 tmp12 = -tmp2;
1331 }
1332 }
1333 }
1334 } else {
1335 if (d4) {
1336 if (d2) {
1337 if (d0) {
1338 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1339 tmp2 = MULTIPLY(d2, FIX_0_541196100);
1340 tmp3 = MULTIPLY(d2, FIX_1_306562965);
1341
1342 tmp0 = (d0 + d4) << CONST_BITS;
1343 tmp1 = (d0 - d4) << CONST_BITS;
1344
1345 tmp10 = tmp0 + tmp3;
1346 tmp13 = tmp0 - tmp3;
1347 tmp11 = tmp1 + tmp2;
1348 tmp12 = tmp1 - tmp2;
1349 } else {
1350 /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
1351 tmp2 = MULTIPLY(d2, FIX_0_541196100);
1352 tmp3 = MULTIPLY(d2, FIX_1_306562965);
1353
1354 tmp0 = d4 << CONST_BITS;
1355
1356 tmp10 = tmp0 + tmp3;
1357 tmp13 = tmp0 - tmp3;
1358 tmp11 = tmp2 - tmp0;
1359 tmp12 = -(tmp0 + tmp2);
1360 }
1361 } else {
1362 if (d0) {
1363 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1364 tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
1365 tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
1366 } else {
1367 /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
1368 tmp10 = tmp13 = d4 << CONST_BITS;
1369 tmp11 = tmp12 = -tmp10;
1370 }
1371 }
1372 } else {
1373 if (d2) {
1374 if (d0) {
1375 /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
1376 tmp2 = MULTIPLY(d2, FIX_0_541196100);
1377 tmp3 = MULTIPLY(d2, FIX_1_306562965);
1378
1379 tmp0 = d0 << CONST_BITS;
1380
1381 tmp10 = tmp0 + tmp3;
1382 tmp13 = tmp0 - tmp3;
1383 tmp11 = tmp0 + tmp2;
1384 tmp12 = tmp0 - tmp2;
1385 } else {
1386 /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
1387 tmp2 = MULTIPLY(d2, FIX_0_541196100);
1388 tmp3 = MULTIPLY(d2, FIX_1_306562965);
1389
1390 tmp10 = tmp3;
1391 tmp13 = -tmp3;
1392 tmp11 = tmp2;
1393 tmp12 = -tmp2;
1394 }
1395 } else {
1396 if (d0) {
1397 /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
1398 tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
1399 } else {
1400 /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
1401 tmp10 = tmp13 = tmp11 = tmp12 = 0;
1402 }
1403 }
1404 }
1405 }
1406
1407 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1408
1409 dataptr[0] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
1410 dataptr[1] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
1411 dataptr[2] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
1412 dataptr[3] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
1413
1414 dataptr += DCTSTRIDE; /* advance pointer to next row */
1415 }
1416
1417 /* Pass 2: process columns. */
1418 /* Note that we must descale the results by a factor of 8 == 2**3, */
1419 /* and also undo the PASS1_BITS scaling. */
1420
1421 dataptr = data;
1422 for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
1423 /* Columns of zeroes can be exploited in the same way as we did with rows.
1424 * However, the row calculation has created many nonzero AC terms, so the
1425 * simplification applies less often (typically 5% to 10% of the time).
1426 * On machines with very fast multiplication, it's possible that the
1427 * test takes more time than it's worth. In that case this section
1428 * may be commented out.
1429 */
1430
1431 d0 = dataptr[DCTSTRIDE*0];
1432 d2 = dataptr[DCTSTRIDE*1];
1433 d4 = dataptr[DCTSTRIDE*2];
1434 d6 = dataptr[DCTSTRIDE*3];
1435
1436 /* Even part: reverse the even part of the forward DCT. */
1437 /* The rotator is sqrt(2)*c(-6). */
1438 if (d6) {
1439 if (d4) {
1440 if (d2) {
1441 if (d0) {
1442 /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
1443 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
1444 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
1445 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
1446
1447 tmp0 = (d0 + d4) << CONST_BITS;
1448 tmp1 = (d0 - d4) << CONST_BITS;
1449
1450 tmp10 = tmp0 + tmp3;
1451 tmp13 = tmp0 - tmp3;
1452 tmp11 = tmp1 + tmp2;
1453 tmp12 = tmp1 - tmp2;
1454 } else {
1455 /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
1456 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
1457 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
1458 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
1459
1460 tmp0 = d4 << CONST_BITS;
1461
1462 tmp10 = tmp0 + tmp3;
1463 tmp13 = tmp0 - tmp3;
1464 tmp11 = tmp2 - tmp0;
1465 tmp12 = -(tmp0 + tmp2);
1466 }
1467 } else {
1468 if (d0) {
1469 /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1470 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
1471 tmp3 = MULTIPLY(d6, FIX_0_541196100);
1472
1473 tmp0 = (d0 + d4) << CONST_BITS;
1474 tmp1 = (d0 - d4) << CONST_BITS;
1475
1476 tmp10 = tmp0 + tmp3;
1477 tmp13 = tmp0 - tmp3;
1478 tmp11 = tmp1 + tmp2;
1479 tmp12 = tmp1 - tmp2;
1480 } else {
1481 /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
1482 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
1483 tmp3 = MULTIPLY(d6, FIX_0_541196100);
1484
1485 tmp0 = d4 << CONST_BITS;
1486
1487 tmp10 = tmp0 + tmp3;
1488 tmp13 = tmp0 - tmp3;
1489 tmp11 = tmp2 - tmp0;
1490 tmp12 = -(tmp0 + tmp2);
1491 }
1492 }
1493 } else {
1494 if (d2) {
1495 if (d0) {
1496 /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
1497 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
1498 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
1499 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
1500
1501 tmp0 = d0 << CONST_BITS;
1502
1503 tmp10 = tmp0 + tmp3;
1504 tmp13 = tmp0 - tmp3;
1505 tmp11 = tmp0 + tmp2;
1506 tmp12 = tmp0 - tmp2;
1507 } else {
1508 /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
1509 z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
1510 tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
1511 tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
1512
1513 tmp10 = tmp3;
1514 tmp13 = -tmp3;
1515 tmp11 = tmp2;
1516 tmp12 = -tmp2;
1517 }
1518 } else {
1519 if (d0) {
1520 /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
1521 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
1522 tmp3 = MULTIPLY(d6, FIX_0_541196100);
1523
1524 tmp0 = d0 << CONST_BITS;
1525
1526 tmp10 = tmp0 + tmp3;
1527 tmp13 = tmp0 - tmp3;
1528 tmp11 = tmp0 + tmp2;
1529 tmp12 = tmp0 - tmp2;
1530 } else {
1531 /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
1532 tmp2 = MULTIPLY(-d6, FIX_1_306562965);
1533 tmp3 = MULTIPLY(d6, FIX_0_541196100);
1534
1535 tmp10 = tmp3;
1536 tmp13 = -tmp3;
1537 tmp11 = tmp2;
1538 tmp12 = -tmp2;
1539 }
1540 }
1541 }
1542 } else {
1543 if (d4) {
1544 if (d2) {
1545 if (d0) {
1546 /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1547 tmp2 = MULTIPLY(d2, FIX_0_541196100);
1548 tmp3 = MULTIPLY(d2, FIX_1_306562965);
1549
1550 tmp0 = (d0 + d4) << CONST_BITS;
1551 tmp1 = (d0 - d4) << CONST_BITS;
1552
1553 tmp10 = tmp0 + tmp3;
1554 tmp13 = tmp0 - tmp3;
1555 tmp11 = tmp1 + tmp2;
1556 tmp12 = tmp1 - tmp2;
1557 } else {
1558 /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
1559 tmp2 = MULTIPLY(d2, FIX_0_541196100);
1560 tmp3 = MULTIPLY(d2, FIX_1_306562965);
1561
1562 tmp0 = d4 << CONST_BITS;
1563
1564 tmp10 = tmp0 + tmp3;
1565 tmp13 = tmp0 - tmp3;
1566 tmp11 = tmp2 - tmp0;
1567 tmp12 = -(tmp0 + tmp2);
1568 }
1569 } else {
1570 if (d0) {
1571 /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1572 tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
1573 tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
1574 } else {
1575 /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
1576 tmp10 = tmp13 = d4 << CONST_BITS;
1577 tmp11 = tmp12 = -tmp10;
1578 }
1579 }
1580 } else {
1581 if (d2) {
1582 if (d0) {
1583 /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
1584 tmp2 = MULTIPLY(d2, FIX_0_541196100);
1585 tmp3 = MULTIPLY(d2, FIX_1_306562965);
1586
1587 tmp0 = d0 << CONST_BITS;
1588
1589 tmp10 = tmp0 + tmp3;
1590 tmp13 = tmp0 - tmp3;
1591 tmp11 = tmp0 + tmp2;
1592 tmp12 = tmp0 - tmp2;
1593 } else {
1594 /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
1595 tmp2 = MULTIPLY(d2, FIX_0_541196100);
1596 tmp3 = MULTIPLY(d2, FIX_1_306562965);
1597
1598 tmp10 = tmp3;
1599 tmp13 = -tmp3;
1600 tmp11 = tmp2;
1601 tmp12 = -tmp2;
1602 }
1603 } else {
1604 if (d0) {
1605 /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
1606 tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
1607 } else {
1608 /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
1609 tmp10 = tmp13 = tmp11 = tmp12 = 0;
1610 }
1611 }
1612 }
1613 }
1614
1615 /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1616
1617 dataptr[DCTSTRIDE*0] = (DCTELEM) DESCALE(tmp10,
1618 CONST_BITS+PASS1_BITS+3);
1619 dataptr[DCTSTRIDE*1] = (DCTELEM) DESCALE(tmp11,
1620 CONST_BITS+PASS1_BITS+3);
1621 dataptr[DCTSTRIDE*2] = (DCTELEM) DESCALE(tmp12,
1622 CONST_BITS+PASS1_BITS+3);
1623 dataptr[DCTSTRIDE*3] = (DCTELEM) DESCALE(tmp13,
1624 CONST_BITS+PASS1_BITS+3);
1625
1626 dataptr++; /* advance pointer to next column */
1627 }
1628}
1629
1630
cd4af68a
ZK
1631#undef FIX
1632#undef CONST_BITS