dct-test: Skip indirection for MMX IDCT permutation
[libav.git] / libavcodec / dct-test.c
CommitLineData
04d7f601
DB
1/*
2 * (c) 2001 Fabrice Bellard
3ac35bdb 3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
04d7f601 4 *
2912e87a 5 * This file is part of Libav.
b78e7197 6 *
2912e87a 7 * Libav is free software; you can redistribute it and/or
04d7f601
DB
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
b78e7197 10 * version 2.1 of the License, or (at your option) any later version.
04d7f601 11 *
2912e87a 12 * Libav is distributed in the hope that it will be useful,
04d7f601
DB
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
2912e87a 18 * License along with Libav; if not, write to the Free Software
04d7f601
DB
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
983e3246 22/**
ba87f080 23 * @file
94f694a4 24 * DCT test (c) 2001 Fabrice Bellard
983e3246
MN
25 * Started from sample code by Juan J. Sierralta P.
26 */
27
667fb97a 28#include "config.h"
de6d9b64
FB
29#include <stdlib.h>
30#include <stdio.h>
31#include <string.h>
667fb97a 32#if HAVE_UNISTD_H
de6d9b64 33#include <unistd.h>
667fb97a 34#endif
12807c8d 35#include <math.h>
de6d9b64 36
c6c98d08 37#include "libavutil/cpu.h"
ae32e509 38#include "libavutil/common.h"
294eaa26 39#include "libavutil/lfg.h"
980f81d9 40#include "libavutil/time.h"
de6d9b64 41
5d3d39c7 42#include "dct.h"
86748dbc 43#include "simple_idct.h"
10ac3618 44#include "aandcttab.h"
65e4c8c9 45#include "faandct.h"
6f08c541 46#include "faanidct.h"
a6493a8f 47#include "x86/idct_xvid.h"
6a813295 48#include "dctref.h"
9e1586fc 49
/* AltiVec forward DCT; declared here, defined elsewhere. */
void ff_fdct_altivec(int16_t *block);

/* ARM inverse DCT variants; declared here, defined elsewhere. */
void ff_j_rev_dct_arm(int16_t *data);
void ff_simple_idct_arm(int16_t *data);
void ff_simple_idct_armv5te(int16_t *data);
void ff_simple_idct_armv6(int16_t *data);
void ff_simple_idct_neon(int16_t *data);
3ac35bdb
MH
59
/**
 * Coefficient layout / post-processing tag attached to each tested transform.
 * The value is handed to permute() to reorder the input block; SCALE_PERM is
 * additionally checked in dct_error() to rescale the transform output.
 */
enum formattag {
    NO_PERM,
    MMX_PERM,
    MMX_SIMPLE_PERM,
    SCALE_PERM,
    SSE2_PERM,
    PARTTRANS_PERM
};

/**
 * One entry of the tables of transforms under test.
 * A zero-filled entry (name == NULL) terminates a table.
 */
struct algo {
    /** Name printed in the result lines. */
    const char *name;
    /** Transform under test; called on a 64-coefficient block it modifies. */
    void (*func)(int16_t *block);
    /** Layout tag, see enum formattag. */
    enum formattag format;
    /** CPU flag bits that must all be reported by the CPU for the entry to run. */
    int cpu_flag;
    /** Non-zero: exceeding the accuracy limits is not reported as a failure. */
    int nonspec;
};
68
4b357756 69static const struct algo fdct_tab[] = {
74965f26 70 { "REF-DBL", ff_ref_fdct, NO_PERM },
856c8e0a 71 { "FAAN", ff_faandct, NO_PERM },
3e2efacd 72 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
0a72533e 73 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
3ac35bdb 74
17337f54 75#if HAVE_MMX_INLINE
74965f26 76 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
0b8b2ae5
DB
77#endif
78#if HAVE_MMXEXT_INLINE
d8eda370 79 { "MMXEXT", ff_fdct_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT },
0b8b2ae5
DB
80#endif
81#if HAVE_SSE2_INLINE
74965f26 82 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
94254fc0 83#endif
3ac35bdb 84
4b357756 85#if HAVE_ALTIVEC
07333750 86 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
4b357756
MR
87#endif
88
4b357756
MR
89 { 0 }
90};
91
92static const struct algo idct_tab[] = {
74965f26
MR
93 { "FAANI", ff_faanidct, NO_PERM },
94 { "REF-DBL", ff_ref_idct, NO_PERM },
c8e1b2fb 95 { "INT", ff_j_rev_dct, MMX_PERM },
e7a972e1 96 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
4b357756 97
17337f54 98#if HAVE_MMX_INLINE
74965f26
MR
99 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
100 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
0b8b2ae5
DB
101#endif
102#if HAVE_MMXEXT_INLINE
d8eda370 103 { "XVID-MMXEXT", ff_idct_xvid_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
0b8b2ae5
DB
104#endif
105#if HAVE_SSE2_INLINE
74965f26 106 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
3ac35bdb
MH
107#endif
108
b250f9c6 109#if ARCH_ARM
74965f26
MR
110 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
111 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
4b357756 112#endif
b250f9c6 113#if HAVE_ARMV5TE
c29d49c1 114 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM, AV_CPU_FLAG_ARMV5TE },
479044ce 115#endif
b250f9c6 116#if HAVE_ARMV6
c29d49c1 117 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM, AV_CPU_FLAG_ARMV6 },
479044ce 118#endif
1e9265cd 119#if HAVE_NEON && ARCH_ARM
c29d49c1 120 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM, AV_CPU_FLAG_NEON },
479044ce 121#endif
479044ce 122
36fa9ef3 123 { 0 }
3ac35bdb
MH
124};
125
/* Fixed-point shift used when rescaling the output of SCALE_PERM transforms. */
#define AANSCALE_BITS 12

/* Number of generated blocks per accuracy measurement. */
#define NB_ITS        20000
/* Number of transform calls between two clock reads in the speed test. */
#define NB_ITS_SPEED  50000
130
36fa9ef3
MR
/*
 * Destination index for each source coefficient when feeding an
 * MMX_SIMPLE_PERM transform: permute() stores src[i] at
 * dst[idct_simple_mmx_perm[i]]. The table is a permutation of 0..63 and is
 * never written, hence const.
 */
static const short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
141
/*
 * Column reordering applied within every row of 8 coefficients for
 * SSE2_PERM transforms: column c of the source lands in column
 * idct_sse2_row_perm[c] of the destination.
 */
static const uint8_t idct_sse2_row_perm[8] = {
    0, 4, 1, 5,
    2, 6, 3, 7,
};
ad246860 143
88bd7fdc
DB
144DECLARE_ALIGNED(16, static int16_t, block)[64];
145DECLARE_ALIGNED(8, static int16_t, block1)[64];
9e1586fc 146
88bd7fdc 147static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng)
ae2e8971
MR
148{
149 int i, j;
150
151 memset(block, 0, 64 * sizeof(*block));
152
153 switch (test) {
154 case 0:
155 for (i = 0; i < 64; i++)
156 block[i] = (av_lfg_get(prng) % 512) - 256;
157 if (is_idct) {
158 ff_ref_fdct(block);
159 for (i = 0; i < 64; i++)
160 block[i] >>= 3;
161 }
162 break;
163 case 1:
164 j = av_lfg_get(prng) % 10 + 1;
165 for (i = 0; i < j; i++)
166 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
167 break;
168 case 2:
169 block[ 0] = av_lfg_get(prng) % 4096 - 2048;
170 block[63] = (block[0] & 1) ^ 1;
171 break;
172 }
173}
174
88bd7fdc 175static void permute(int16_t dst[64], const int16_t src[64], int perm)
ae2e8971
MR
176{
177 int i;
178
179 if (perm == MMX_PERM) {
180 for (i = 0; i < 64; i++)
913fa85a 181 dst[(i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2)] = src[i];
ae2e8971
MR
182 } else if (perm == MMX_SIMPLE_PERM) {
183 for (i = 0; i < 64; i++)
184 dst[idct_simple_mmx_perm[i]] = src[i];
185 } else if (perm == SSE2_PERM) {
186 for (i = 0; i < 64; i++)
187 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
188 } else if (perm == PARTTRANS_PERM) {
189 for (i = 0; i < 64; i++)
190 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
191 } else {
192 for (i = 0; i < 64; i++)
193 dst[i] = src[i];
194 }
195}
196
dbf396d4 197static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
de6d9b64 198{
88bd7fdc 199 void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
de6d9b64 200 int it, i, scale;
de6d9b64 201 int err_inf, v;
dbf396d4 202 int64_t err2, ti, ti1, it1, err_sum = 0;
36fa9ef3
MR
203 int64_t sysErr[64], sysErrMax = 0;
204 int maxout = 0;
205 int blockSumErrMax = 0, blockSumErr;
64bde197 206 AVLFG prng;
dbf396d4
MR
207 double omse, ome;
208 int spec_err;
de6d9b64 209
64bde197 210 av_lfg_init(&prng, 1);
de6d9b64
FB
211
212 err_inf = 0;
213 err2 = 0;
36fa9ef3
MR
214 for (i = 0; i < 64; i++)
215 sysErr[i] = 0;
216 for (it = 0; it < NB_ITS; it++) {
ae2e8971
MR
217 init_block(block1, test, is_idct, &prng);
218 permute(block, block1, dct->format);
9e1586fc 219
4f905a65 220 dct->func(block);
db7d8fb4 221 emms_c();
9e1586fc 222
4f905a65 223 if (dct->format == SCALE_PERM) {
36fa9ef3
MR
224 for (i = 0; i < 64; i++) {
225 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
226 block[i] = (block[i] * scale) >> AANSCALE_BITS;
86748dbc
MN
227 }
228 }
229
74965f26 230 ref(block1);
de6d9b64 231
36fa9ef3
MR
232 blockSumErr = 0;
233 for (i = 0; i < 64; i++) {
dbf396d4
MR
234 int err = block[i] - block1[i];
235 err_sum += err;
236 v = abs(err);
de6d9b64
FB
237 if (v > err_inf)
238 err_inf = v;
239 err2 += v * v;
bb270c08
DB
240 sysErr[i] += block[i] - block1[i];
241 blockSumErr += v;
36fa9ef3
MR
242 if (abs(block[i]) > maxout)
243 maxout = abs(block[i]);
de6d9b64 244 }
36fa9ef3
MR
245 if (blockSumErrMax < blockSumErr)
246 blockSumErrMax = blockSumErr;
86748dbc 247 }
36fa9ef3
MR
248 for (i = 0; i < 64; i++)
249 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
115329f1 250
36fa9ef3
MR
251 for (i = 0; i < 64; i++) {
252 if (i % 8 == 0)
253 printf("\n");
254 printf("%7d ", (int) sysErr[i]);
de6d9b64 255 }
86748dbc 256 printf("\n");
115329f1 257
dbf396d4
MR
258 omse = (double) err2 / NB_ITS / 64;
259 ome = (double) err_sum / NB_ITS / 64;
260
261 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
262
263 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
cf2b4f88 264 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
dbf396d4 265 omse, ome, (double) sysErrMax / NB_ITS,
36fa9ef3 266 maxout, blockSumErrMax);
e6ff0648 267
dbf396d4
MR
268 if (spec_err && !dct->nonspec)
269 return 1;
270
7fd2c138 271 if (!speed)
dbf396d4 272 return 0;
7fd2c138 273
de6d9b64 274 /* speed test */
ae2e8971
MR
275 init_block(block, test, is_idct, &prng);
276 permute(block1, block, dct->format);
9e1586fc 277
980f81d9 278 ti = av_gettime();
de6d9b64
FB
279 it1 = 0;
280 do {
36fa9ef3 281 for (it = 0; it < NB_ITS_SPEED; it++) {
ae2e8971 282 memcpy(block, block1, sizeof(block));
4f905a65 283 dct->func(block);
de6d9b64
FB
284 }
285 it1 += NB_ITS_SPEED;
980f81d9 286 ti1 = av_gettime() - ti;
de6d9b64 287 } while (ti1 < 1000000);
db7d8fb4 288 emms_c();
de6d9b64 289
cf2b4f88 290 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
36fa9ef3 291 (double) it1 * 1000.0 / (double) ti1);
dbf396d4
MR
292
293 return 0;
de6d9b64
FB
294}
295
c6727809
MR
296DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
297DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
a46a3ce4 298
504ffed1 299static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
a46a3ce4
FB
300{
301 static int init;
302 static double c8[8][8];
303 static double c4[4][4];
304 double block1[64], block2[64], block3[64];
305 double s, sum, v;
306 int i, j, k;
307
308 if (!init) {
309 init = 1;
310
36fa9ef3 311 for (i = 0; i < 8; i++) {
a46a3ce4 312 sum = 0;
36fa9ef3
MR
313 for (j = 0; j < 8; j++) {
314 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
a46a3ce4
FB
315 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
316 sum += c8[i][j] * c8[i][j];
317 }
318 }
115329f1 319
36fa9ef3 320 for (i = 0; i < 4; i++) {
a46a3ce4 321 sum = 0;
36fa9ef3
MR
322 for (j = 0; j < 4; j++) {
323 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
a46a3ce4
FB
324 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
325 sum += c4[i][j] * c4[i][j];
326 }
327 }
328 }
329
330 /* butterfly */
652f0197 331 s = 0.5 * sqrt(2.0);
36fa9ef3
MR
332 for (i = 0; i < 4; i++) {
333 for (j = 0; j < 8; j++) {
334 block1[8 * (2 * i) + j] =
335 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
336 block1[8 * (2 * i + 1) + j] =
337 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
a46a3ce4
FB
338 }
339 }
340
341 /* idct8 on lines */
36fa9ef3
MR
342 for (i = 0; i < 8; i++) {
343 for (j = 0; j < 8; j++) {
a46a3ce4 344 sum = 0;
36fa9ef3
MR
345 for (k = 0; k < 8; k++)
346 sum += c8[k][j] * block1[8 * i + k];
347 block2[8 * i + j] = sum;
a46a3ce4
FB
348 }
349 }
350
351 /* idct4 */
36fa9ef3
MR
352 for (i = 0; i < 8; i++) {
353 for (j = 0; j < 4; j++) {
a46a3ce4
FB
354 /* top */
355 sum = 0;
36fa9ef3
MR
356 for (k = 0; k < 4; k++)
357 sum += c4[k][j] * block2[8 * (2 * k) + i];
358 block3[8 * (2 * j) + i] = sum;
a46a3ce4
FB
359
360 /* bottom */
361 sum = 0;
36fa9ef3
MR
362 for (k = 0; k < 4; k++)
363 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
364 block3[8 * (2 * j + 1) + i] = sum;
a46a3ce4
FB
365 }
366 }
367
368 /* clamp and store the result */
36fa9ef3
MR
369 for (i = 0; i < 8; i++) {
370 for (j = 0; j < 8; j++) {
371 v = block3[8 * i + j];
372 if (v < 0) v = 0;
373 else if (v > 255) v = 255;
374 dest[i * linesize + j] = (int) rint(v);
a46a3ce4
FB
375 }
376 }
377}
378
504ffed1 379static void idct248_error(const char *name,
36fa9ef3 380 void (*idct248_put)(uint8_t *dest, int line_size,
7fd2c138
MR
381 int16_t *block),
382 int speed)
a46a3ce4
FB
383{
384 int it, i, it1, ti, ti1, err_max, v;
64bde197 385 AVLFG prng;
294eaa26 386
64bde197 387 av_lfg_init(&prng, 1);
115329f1 388
a46a3ce4
FB
389 /* just one test to see if code is correct (precision is less
390 important here) */
391 err_max = 0;
36fa9ef3 392 for (it = 0; it < NB_ITS; it++) {
652f0197 393 /* XXX: use forward transform to generate values */
36fa9ef3 394 for (i = 0; i < 64; i++)
64bde197 395 block1[i] = av_lfg_get(&prng) % 256 - 128;
652f0197
FB
396 block1[0] += 1024;
397
36fa9ef3
MR
398 for (i = 0; i < 64; i++)
399 block[i] = block1[i];
a46a3ce4 400 idct248_ref(img_dest1, 8, block);
115329f1 401
36fa9ef3
MR
402 for (i = 0; i < 64; i++)
403 block[i] = block1[i];
652f0197 404 idct248_put(img_dest, 8, block);
115329f1 405
36fa9ef3
MR
406 for (i = 0; i < 64; i++) {
407 v = abs((int) img_dest[i] - (int) img_dest1[i]);
652f0197
FB
408 if (v == 255)
409 printf("%d %d\n", img_dest[i], img_dest1[i]);
410 if (v > err_max)
411 err_max = v;
412 }
a46a3ce4 413 }
36fa9ef3 414 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
a46a3ce4 415
7fd2c138
MR
416 if (!speed)
417 return;
418
980f81d9 419 ti = av_gettime();
a46a3ce4
FB
420 it1 = 0;
421 do {
36fa9ef3
MR
422 for (it = 0; it < NB_ITS_SPEED; it++) {
423 for (i = 0; i < 64; i++)
424 block[i] = block1[i];
a46a3ce4
FB
425 idct248_put(img_dest, 8, block);
426 }
427 it1 += NB_ITS_SPEED;
980f81d9 428 ti1 = av_gettime() - ti;
a46a3ce4 429 } while (ti1 < 1000000);
db7d8fb4 430 emms_c();
a46a3ce4 431
36fa9ef3
MR
432 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
433 (double) it1 * 1000.0 / (double) ti1);
a46a3ce4
FB
434}
435
/** Print the command line usage summary to standard output. */
static void help(void)
{
    static const char usage[] =
        "dct-test [-i] [<test-number>]\n"
        "test-number 0 -> test with random matrixes\n"
        " 1 -> test with random sparse matrixes\n"
        " 2 -> do 3. test from mpeg4 std\n"
        "-i test IDCT implementations\n"
        "-4 test IDCT248 implementations\n"
        "-t speed test\n";

    fputs(usage, stdout);
}
446
667fb97a
RB
447#if !HAVE_GETOPT
448#include "compat/getopt.c"
449#endif
450
de6d9b64
FB
451int main(int argc, char **argv)
452{
a46a3ce4 453 int test_idct = 0, test_248_dct = 0;
36fa9ef3
MR
454 int c, i;
455 int test = 1;
7fd2c138 456 int speed = 0;
dbf396d4 457 int err = 0;
36fa9ef3 458
0de74546 459 ff_ref_dct_init();
f67a10cd 460
36fa9ef3 461 for (;;) {
7fd2c138 462 c = getopt(argc, argv, "ih4t");
9e1586fc
FB
463 if (c == -1)
464 break;
36fa9ef3 465 switch (c) {
9e1586fc
FB
466 case 'i':
467 test_idct = 1;
468 break;
a46a3ce4
FB
469 case '4':
470 test_248_dct = 1;
471 break;
7fd2c138
MR
472 case 't':
473 speed = 1;
474 break;
36fa9ef3 475 default:
9e1586fc
FB
476 case 'h':
477 help();
c6bdc908 478 return 0;
9e1586fc
FB
479 }
480 }
115329f1 481
36fa9ef3
MR
482 if (optind < argc)
483 test = atoi(argv[optind]);
115329f1 484
f36b3902 485 printf("Libav DCT/IDCT test\n");
9e1586fc 486
a46a3ce4 487 if (test_248_dct) {
7fd2c138 488 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
9e1586fc 489 } else {
cb44b21d 490 const int cpu_flags = av_get_cpu_flags();
4b357756 491 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
36fa9ef3 492 for (i = 0; algos[i].name; i++)
746ad4e0 493 if (!(~cpu_flags & algos[i].cpu_flag)) {
dbf396d4 494 err |= dct_error(&algos[i], test, test_idct, speed);
36fa9ef3 495 }
9e1586fc 496 }
dbf396d4 497
5331d2b9
DB
498 if (err)
499 printf("Error: %d.\n", err);
500
501 return !!err;
de6d9b64 502}