dct-test: Move cpu_flags variable out of global scope
[libav.git] / libavcodec / dct-test.c
CommitLineData
04d7f601
DB
1/*
2 * (c) 2001 Fabrice Bellard
3ac35bdb 3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
04d7f601 4 *
2912e87a 5 * This file is part of Libav.
b78e7197 6 *
2912e87a 7 * Libav is free software; you can redistribute it and/or
04d7f601
DB
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
b78e7197 10 * version 2.1 of the License, or (at your option) any later version.
04d7f601 11 *
2912e87a 12 * Libav is distributed in the hope that it will be useful,
04d7f601
DB
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
2912e87a 18 * License along with Libav; if not, write to the Free Software
04d7f601
DB
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
983e3246 22/**
ba87f080 23 * @file
94f694a4 24 * DCT test (c) 2001 Fabrice Bellard
983e3246
MN
25 * Started from sample code by Juan J. Sierralta P.
26 */
27
667fb97a 28#include "config.h"
de6d9b64
FB
29#include <stdlib.h>
30#include <stdio.h>
31#include <string.h>
667fb97a 32#if HAVE_UNISTD_H
de6d9b64 33#include <unistd.h>
667fb97a 34#endif
12807c8d 35#include <math.h>
de6d9b64 36
c6c98d08 37#include "libavutil/cpu.h"
ae32e509 38#include "libavutil/common.h"
294eaa26 39#include "libavutil/lfg.h"
980f81d9 40#include "libavutil/time.h"
de6d9b64 41
5d3d39c7 42#include "dct.h"
86748dbc 43#include "simple_idct.h"
10ac3618 44#include "aandcttab.h"
65e4c8c9 45#include "faandct.h"
6f08c541 46#include "faanidct.h"
a6493a8f 47#include "x86/idct_xvid.h"
6a813295 48#include "dctref.h"
9e1586fc 49
/*
 * Architecture-specific transforms referenced by the tables below and
 * declared directly in this file.
 */

/* AltiVec forward DCT */
void ff_fdct_altivec(int16_t *block);

/* ARM inverse DCTs */
void ff_j_rev_dct_arm(int16_t *data);
void ff_simple_idct_arm(int16_t *data);
void ff_simple_idct_armv5te(int16_t *data);
void ff_simple_idct_armv6(int16_t *data);
void ff_simple_idct_neon(int16_t *data);
3ac35bdb
MH
59
/**
 * Coefficient layout expected by a transform. permute() reorders the input
 * block according to this value before the transform is called; dct_error()
 * additionally rescales the output of SCALE_PERM transforms.
 */
enum formattag {
    NO_PERM,
    MMX_PERM,
    MMX_SIMPLE_PERM,
    SCALE_PERM,
    SSE2_PERM,
    PARTTRANS_PERM
};

/**
 * Description of one transform implementation under test.
 * The tables below use positional initializers, so the field order matters.
 */
struct algo {
    /** label printed in the reports; a null name terminates a table */
    const char *name;
    /** in-place transform of a 64-coefficient block */
    void (*func)(int16_t *block);
    /** input layout the transform expects */
    enum formattag format;
    /** AV_CPU_FLAG_* bits that must all be present for the entry to run */
    int mm_support;
    /** nonzero: exceeding the accuracy limits is not reported as a failure */
    int nonspec;
};
68
4b357756 69static const struct algo fdct_tab[] = {
74965f26 70 { "REF-DBL", ff_ref_fdct, NO_PERM },
856c8e0a 71 { "FAAN", ff_faandct, NO_PERM },
3e2efacd 72 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
0a72533e 73 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
3ac35bdb 74
17337f54 75#if HAVE_MMX_INLINE
74965f26 76 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
0b8b2ae5
DB
77#endif
78#if HAVE_MMXEXT_INLINE
d8eda370 79 { "MMXEXT", ff_fdct_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT },
0b8b2ae5
DB
80#endif
81#if HAVE_SSE2_INLINE
74965f26 82 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
94254fc0 83#endif
3ac35bdb 84
4b357756 85#if HAVE_ALTIVEC
07333750 86 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
4b357756
MR
87#endif
88
4b357756
MR
89 { 0 }
90};
91
92static const struct algo idct_tab[] = {
74965f26
MR
93 { "FAANI", ff_faanidct, NO_PERM },
94 { "REF-DBL", ff_ref_idct, NO_PERM },
c8e1b2fb 95 { "INT", ff_j_rev_dct, MMX_PERM },
e7a972e1 96 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
4b357756 97
17337f54 98#if HAVE_MMX_INLINE
74965f26
MR
99 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
100 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
0b8b2ae5
DB
101#endif
102#if HAVE_MMXEXT_INLINE
d8eda370 103 { "XVID-MMXEXT", ff_idct_xvid_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
0b8b2ae5
DB
104#endif
105#if HAVE_SSE2_INLINE
74965f26 106 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
3ac35bdb
MH
107#endif
108
b250f9c6 109#if ARCH_ARM
74965f26
MR
110 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
111 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
4b357756 112#endif
b250f9c6 113#if HAVE_ARMV5TE
c29d49c1 114 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM, AV_CPU_FLAG_ARMV5TE },
479044ce 115#endif
b250f9c6 116#if HAVE_ARMV6
c29d49c1 117 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM, AV_CPU_FLAG_ARMV6 },
479044ce 118#endif
1e9265cd 119#if HAVE_NEON && ARCH_ARM
c29d49c1 120 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM, AV_CPU_FLAG_NEON },
479044ce 121#endif
479044ce 122
36fa9ef3 123 { 0 }
3ac35bdb
MH
124};
125
/* Fractional bits used by dct_error() when rescaling SCALE_PERM output. */
#define AANSCALE_BITS 12

/* Number of generated blocks per accuracy measurement. */
#define NB_ITS        20000
/* Number of transforms between two clock reads in the speed loops. */
#define NB_ITS_SPEED  50000
130
9e1586fc
FB
/* Destination index per source coefficient for MMX_PERM;
 * filled in by idct_mmx_init(). */
static short idct_mmx_perm[64];

/* Destination index per source coefficient for MMX_SIMPLE_PERM. */
static short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0c, 0x05, 0x0d,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1c, 0x15, 0x1d,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2c, 0x25, 0x2d,
    0x12, 0x1a, 0x16, 0x1b, 0x13, 0x1e, 0x17, 0x1f,
    0x02, 0x0a, 0x06, 0x0b, 0x03, 0x0e, 0x07, 0x0f,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3c, 0x35, 0x3d,
    0x22, 0x2a, 0x26, 0x2b, 0x23, 0x2e, 0x27, 0x2f,
    0x32, 0x3a, 0x36, 0x3b, 0x33, 0x3e, 0x37, 0x3f,
};

/* Destination column per source column, applied within each row
 * for SSE2_PERM. */
static const uint8_t idct_sse2_row_perm[8] = {
    0, 4, 1, 5, 2, 6, 3, 7
};
ad246860 145
504ffed1 146static void idct_mmx_init(void)
9e1586fc
FB
147{
148 int i;
149
150 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
151 for (i = 0; i < 64; i++) {
bb270c08 152 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
9e1586fc
FB
153 }
154}
155
88bd7fdc
DB
156DECLARE_ALIGNED(16, static int16_t, block)[64];
157DECLARE_ALIGNED(8, static int16_t, block1)[64];
9e1586fc 158
88bd7fdc 159static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng)
ae2e8971
MR
160{
161 int i, j;
162
163 memset(block, 0, 64 * sizeof(*block));
164
165 switch (test) {
166 case 0:
167 for (i = 0; i < 64; i++)
168 block[i] = (av_lfg_get(prng) % 512) - 256;
169 if (is_idct) {
170 ff_ref_fdct(block);
171 for (i = 0; i < 64; i++)
172 block[i] >>= 3;
173 }
174 break;
175 case 1:
176 j = av_lfg_get(prng) % 10 + 1;
177 for (i = 0; i < j; i++)
178 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
179 break;
180 case 2:
181 block[ 0] = av_lfg_get(prng) % 4096 - 2048;
182 block[63] = (block[0] & 1) ^ 1;
183 break;
184 }
185}
186
88bd7fdc 187static void permute(int16_t dst[64], const int16_t src[64], int perm)
ae2e8971
MR
188{
189 int i;
190
191 if (perm == MMX_PERM) {
192 for (i = 0; i < 64; i++)
193 dst[idct_mmx_perm[i]] = src[i];
194 } else if (perm == MMX_SIMPLE_PERM) {
195 for (i = 0; i < 64; i++)
196 dst[idct_simple_mmx_perm[i]] = src[i];
197 } else if (perm == SSE2_PERM) {
198 for (i = 0; i < 64; i++)
199 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
200 } else if (perm == PARTTRANS_PERM) {
201 for (i = 0; i < 64; i++)
202 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
203 } else {
204 for (i = 0; i < 64; i++)
205 dst[i] = src[i];
206 }
207}
208
dbf396d4 209static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
de6d9b64 210{
88bd7fdc 211 void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
de6d9b64 212 int it, i, scale;
de6d9b64 213 int err_inf, v;
dbf396d4 214 int64_t err2, ti, ti1, it1, err_sum = 0;
36fa9ef3
MR
215 int64_t sysErr[64], sysErrMax = 0;
216 int maxout = 0;
217 int blockSumErrMax = 0, blockSumErr;
64bde197 218 AVLFG prng;
dbf396d4
MR
219 double omse, ome;
220 int spec_err;
de6d9b64 221
64bde197 222 av_lfg_init(&prng, 1);
de6d9b64
FB
223
224 err_inf = 0;
225 err2 = 0;
36fa9ef3
MR
226 for (i = 0; i < 64; i++)
227 sysErr[i] = 0;
228 for (it = 0; it < NB_ITS; it++) {
ae2e8971
MR
229 init_block(block1, test, is_idct, &prng);
230 permute(block, block1, dct->format);
9e1586fc 231
4f905a65 232 dct->func(block);
db7d8fb4 233 emms_c();
9e1586fc 234
4f905a65 235 if (dct->format == SCALE_PERM) {
36fa9ef3
MR
236 for (i = 0; i < 64; i++) {
237 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
238 block[i] = (block[i] * scale) >> AANSCALE_BITS;
86748dbc
MN
239 }
240 }
241
74965f26 242 ref(block1);
de6d9b64 243
36fa9ef3
MR
244 blockSumErr = 0;
245 for (i = 0; i < 64; i++) {
dbf396d4
MR
246 int err = block[i] - block1[i];
247 err_sum += err;
248 v = abs(err);
de6d9b64
FB
249 if (v > err_inf)
250 err_inf = v;
251 err2 += v * v;
bb270c08
DB
252 sysErr[i] += block[i] - block1[i];
253 blockSumErr += v;
36fa9ef3
MR
254 if (abs(block[i]) > maxout)
255 maxout = abs(block[i]);
de6d9b64 256 }
36fa9ef3
MR
257 if (blockSumErrMax < blockSumErr)
258 blockSumErrMax = blockSumErr;
86748dbc 259 }
36fa9ef3
MR
260 for (i = 0; i < 64; i++)
261 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
115329f1 262
36fa9ef3
MR
263 for (i = 0; i < 64; i++) {
264 if (i % 8 == 0)
265 printf("\n");
266 printf("%7d ", (int) sysErr[i]);
de6d9b64 267 }
86748dbc 268 printf("\n");
115329f1 269
dbf396d4
MR
270 omse = (double) err2 / NB_ITS / 64;
271 ome = (double) err_sum / NB_ITS / 64;
272
273 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
274
275 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
cf2b4f88 276 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
dbf396d4 277 omse, ome, (double) sysErrMax / NB_ITS,
36fa9ef3 278 maxout, blockSumErrMax);
e6ff0648 279
dbf396d4
MR
280 if (spec_err && !dct->nonspec)
281 return 1;
282
7fd2c138 283 if (!speed)
dbf396d4 284 return 0;
7fd2c138 285
de6d9b64 286 /* speed test */
ae2e8971
MR
287 init_block(block, test, is_idct, &prng);
288 permute(block1, block, dct->format);
9e1586fc 289
980f81d9 290 ti = av_gettime();
de6d9b64
FB
291 it1 = 0;
292 do {
36fa9ef3 293 for (it = 0; it < NB_ITS_SPEED; it++) {
ae2e8971 294 memcpy(block, block1, sizeof(block));
4f905a65 295 dct->func(block);
de6d9b64
FB
296 }
297 it1 += NB_ITS_SPEED;
980f81d9 298 ti1 = av_gettime() - ti;
de6d9b64 299 } while (ti1 < 1000000);
db7d8fb4 300 emms_c();
de6d9b64 301
cf2b4f88 302 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
36fa9ef3 303 (double) it1 * 1000.0 / (double) ti1);
dbf396d4
MR
304
305 return 0;
de6d9b64
FB
306}
307
c6727809
MR
308DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
309DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
a46a3ce4 310
504ffed1 311static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
a46a3ce4
FB
312{
313 static int init;
314 static double c8[8][8];
315 static double c4[4][4];
316 double block1[64], block2[64], block3[64];
317 double s, sum, v;
318 int i, j, k;
319
320 if (!init) {
321 init = 1;
322
36fa9ef3 323 for (i = 0; i < 8; i++) {
a46a3ce4 324 sum = 0;
36fa9ef3
MR
325 for (j = 0; j < 8; j++) {
326 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
a46a3ce4
FB
327 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
328 sum += c8[i][j] * c8[i][j];
329 }
330 }
115329f1 331
36fa9ef3 332 for (i = 0; i < 4; i++) {
a46a3ce4 333 sum = 0;
36fa9ef3
MR
334 for (j = 0; j < 4; j++) {
335 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
a46a3ce4
FB
336 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
337 sum += c4[i][j] * c4[i][j];
338 }
339 }
340 }
341
342 /* butterfly */
652f0197 343 s = 0.5 * sqrt(2.0);
36fa9ef3
MR
344 for (i = 0; i < 4; i++) {
345 for (j = 0; j < 8; j++) {
346 block1[8 * (2 * i) + j] =
347 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
348 block1[8 * (2 * i + 1) + j] =
349 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
a46a3ce4
FB
350 }
351 }
352
353 /* idct8 on lines */
36fa9ef3
MR
354 for (i = 0; i < 8; i++) {
355 for (j = 0; j < 8; j++) {
a46a3ce4 356 sum = 0;
36fa9ef3
MR
357 for (k = 0; k < 8; k++)
358 sum += c8[k][j] * block1[8 * i + k];
359 block2[8 * i + j] = sum;
a46a3ce4
FB
360 }
361 }
362
363 /* idct4 */
36fa9ef3
MR
364 for (i = 0; i < 8; i++) {
365 for (j = 0; j < 4; j++) {
a46a3ce4
FB
366 /* top */
367 sum = 0;
36fa9ef3
MR
368 for (k = 0; k < 4; k++)
369 sum += c4[k][j] * block2[8 * (2 * k) + i];
370 block3[8 * (2 * j) + i] = sum;
a46a3ce4
FB
371
372 /* bottom */
373 sum = 0;
36fa9ef3
MR
374 for (k = 0; k < 4; k++)
375 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
376 block3[8 * (2 * j + 1) + i] = sum;
a46a3ce4
FB
377 }
378 }
379
380 /* clamp and store the result */
36fa9ef3
MR
381 for (i = 0; i < 8; i++) {
382 for (j = 0; j < 8; j++) {
383 v = block3[8 * i + j];
384 if (v < 0) v = 0;
385 else if (v > 255) v = 255;
386 dest[i * linesize + j] = (int) rint(v);
a46a3ce4
FB
387 }
388 }
389}
390
504ffed1 391static void idct248_error(const char *name,
36fa9ef3 392 void (*idct248_put)(uint8_t *dest, int line_size,
7fd2c138
MR
393 int16_t *block),
394 int speed)
a46a3ce4
FB
395{
396 int it, i, it1, ti, ti1, err_max, v;
64bde197 397 AVLFG prng;
294eaa26 398
64bde197 399 av_lfg_init(&prng, 1);
115329f1 400
a46a3ce4
FB
401 /* just one test to see if code is correct (precision is less
402 important here) */
403 err_max = 0;
36fa9ef3 404 for (it = 0; it < NB_ITS; it++) {
652f0197 405 /* XXX: use forward transform to generate values */
36fa9ef3 406 for (i = 0; i < 64; i++)
64bde197 407 block1[i] = av_lfg_get(&prng) % 256 - 128;
652f0197
FB
408 block1[0] += 1024;
409
36fa9ef3
MR
410 for (i = 0; i < 64; i++)
411 block[i] = block1[i];
a46a3ce4 412 idct248_ref(img_dest1, 8, block);
115329f1 413
36fa9ef3
MR
414 for (i = 0; i < 64; i++)
415 block[i] = block1[i];
652f0197 416 idct248_put(img_dest, 8, block);
115329f1 417
36fa9ef3
MR
418 for (i = 0; i < 64; i++) {
419 v = abs((int) img_dest[i] - (int) img_dest1[i]);
652f0197
FB
420 if (v == 255)
421 printf("%d %d\n", img_dest[i], img_dest1[i]);
422 if (v > err_max)
423 err_max = v;
424 }
a46a3ce4 425 }
36fa9ef3 426 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
a46a3ce4 427
7fd2c138
MR
428 if (!speed)
429 return;
430
980f81d9 431 ti = av_gettime();
a46a3ce4
FB
432 it1 = 0;
433 do {
36fa9ef3
MR
434 for (it = 0; it < NB_ITS_SPEED; it++) {
435 for (i = 0; i < 64; i++)
436 block[i] = block1[i];
a46a3ce4
FB
437 idct248_put(img_dest, 8, block);
438 }
439 it1 += NB_ITS_SPEED;
980f81d9 440 ti1 = av_gettime() - ti;
a46a3ce4 441 } while (ti1 < 1000000);
db7d8fb4 442 emms_c();
a46a3ce4 443
36fa9ef3
MR
444 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
445 (double) it1 * 1000.0 / (double) ti1);
a46a3ce4
FB
446}
447
/** Print the command line summary to standard output. */
static void help(void)
{
    static const char usage[] =
        "dct-test [-i] [<test-number>]\n"
        "test-number 0 -> test with random matrixes\n"
        " 1 -> test with random sparse matrixes\n"
        " 2 -> do 3. test from mpeg4 std\n"
        "-i test IDCT implementations\n"
        "-4 test IDCT248 implementations\n"
        "-t speed test\n";

    fputs(usage, stdout);
}
458
667fb97a
RB
459#if !HAVE_GETOPT
460#include "compat/getopt.c"
461#endif
462
de6d9b64
FB
463int main(int argc, char **argv)
464{
a46a3ce4 465 int test_idct = 0, test_248_dct = 0;
36fa9ef3
MR
466 int c, i;
467 int test = 1;
7fd2c138 468 int speed = 0;
dbf396d4 469 int err = 0;
36fa9ef3 470
0de74546 471 ff_ref_dct_init();
9e1586fc 472 idct_mmx_init();
f67a10cd 473
36fa9ef3 474 for (;;) {
7fd2c138 475 c = getopt(argc, argv, "ih4t");
9e1586fc
FB
476 if (c == -1)
477 break;
36fa9ef3 478 switch (c) {
9e1586fc
FB
479 case 'i':
480 test_idct = 1;
481 break;
a46a3ce4
FB
482 case '4':
483 test_248_dct = 1;
484 break;
7fd2c138
MR
485 case 't':
486 speed = 1;
487 break;
36fa9ef3 488 default:
9e1586fc
FB
489 case 'h':
490 help();
c6bdc908 491 return 0;
9e1586fc
FB
492 }
493 }
115329f1 494
36fa9ef3
MR
495 if (optind < argc)
496 test = atoi(argv[optind]);
115329f1 497
f36b3902 498 printf("Libav DCT/IDCT test\n");
9e1586fc 499
a46a3ce4 500 if (test_248_dct) {
7fd2c138 501 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
9e1586fc 502 } else {
cb44b21d 503 const int cpu_flags = av_get_cpu_flags();
4b357756 504 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
36fa9ef3 505 for (i = 0; algos[i].name; i++)
4b357756 506 if (!(~cpu_flags & algos[i].mm_support)) {
dbf396d4 507 err |= dct_error(&algos[i], test, test_idct, speed);
36fa9ef3 508 }
9e1586fc 509 }
dbf396d4 510
5331d2b9
DB
511 if (err)
512 printf("Error: %d.\n", err);
513
514 return !!err;
de6d9b64 515}