arm: cosmetics: Consistently use lowercase for shift operators
[libav.git] / libavcodec / dct-test.c
CommitLineData
04d7f601
DB
1/*
2 * (c) 2001 Fabrice Bellard
3ac35bdb 3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
04d7f601 4 *
2912e87a 5 * This file is part of Libav.
b78e7197 6 *
2912e87a 7 * Libav is free software; you can redistribute it and/or
04d7f601
DB
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
b78e7197 10 * version 2.1 of the License, or (at your option) any later version.
04d7f601 11 *
2912e87a 12 * Libav is distributed in the hope that it will be useful,
04d7f601
DB
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
2912e87a 18 * License along with Libav; if not, write to the Free Software
04d7f601
DB
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
983e3246 22/**
ba87f080 23 * @file
94f694a4 24 * DCT test (c) 2001 Fabrice Bellard
983e3246
MN
25 * Started from sample code by Juan J. Sierralta P.
26 */
27
667fb97a 28#include "config.h"
de6d9b64
FB
29#include <stdlib.h>
30#include <stdio.h>
31#include <string.h>
667fb97a 32#if HAVE_UNISTD_H
de6d9b64 33#include <unistd.h>
667fb97a 34#endif
12807c8d 35#include <math.h>
de6d9b64 36
c6c98d08 37#include "libavutil/cpu.h"
ae32e509 38#include "libavutil/common.h"
294eaa26 39#include "libavutil/lfg.h"
980f81d9 40#include "libavutil/time.h"
de6d9b64 41
5d3d39c7 42#include "dct.h"
86748dbc 43#include "simple_idct.h"
10ac3618 44#include "aandcttab.h"
65e4c8c9 45#include "faandct.h"
6f08c541 46#include "faanidct.h"
a6493a8f 47#include "x86/idct_xvid.h"
6a813295 48#include "dctref.h"
9e1586fc 49
/* Prototypes for architecture-specific transforms referenced by the tables below. */

/* AltiVec (listed in fdct_tab) */
void ff_fdct_altivec(int16_t *block);

/* ARM (listed in idct_tab) */
void ff_j_rev_dct_arm(int16_t *data);
void ff_simple_idct_arm(int16_t *data);
void ff_simple_idct_armv5te(int16_t *data);
void ff_simple_idct_armv6(int16_t *data);
void ff_simple_idct_neon(int16_t *data);
3ac35bdb
MH
59
/**
 * One transform implementation under test.
 */
struct algo {
    /* Label printed with the results; a null name terminates a table. */
    const char *name;
    /* In-place transform applied to a block of 64 coefficients. */
    void (*func)(int16_t *block);
    /* Input ordering (see permute()) or output scaling convention of func. */
    enum formattag {
        NO_PERM,
        MMX_PERM,
        MMX_SIMPLE_PERM,
        SCALE_PERM,
        SSE2_PERM,
        PARTTRANS_PERM
    } format;
    /* CPU flags that must all be present for main() to run this entry. */
    int mm_support;
    /* Nonzero: an out-of-tolerance accuracy result is not reported as an error. */
    int nonspec;
};
68
aadd27cd
MN
/* CPU feature flags, set once in main() and used there to skip unsupported entries. */
static int cpu_flags;
70
4b357756 71static const struct algo fdct_tab[] = {
74965f26 72 { "REF-DBL", ff_ref_fdct, NO_PERM },
856c8e0a 73 { "FAAN", ff_faandct, NO_PERM },
3e2efacd 74 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
0a72533e 75 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
3ac35bdb 76
17337f54 77#if HAVE_MMX_INLINE
74965f26 78 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
0b8b2ae5
DB
79#endif
80#if HAVE_MMXEXT_INLINE
d8eda370 81 { "MMXEXT", ff_fdct_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT },
0b8b2ae5
DB
82#endif
83#if HAVE_SSE2_INLINE
74965f26 84 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
94254fc0 85#endif
3ac35bdb 86
4b357756 87#if HAVE_ALTIVEC
07333750 88 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
4b357756
MR
89#endif
90
4b357756
MR
91 { 0 }
92};
93
94static const struct algo idct_tab[] = {
74965f26
MR
95 { "FAANI", ff_faanidct, NO_PERM },
96 { "REF-DBL", ff_ref_idct, NO_PERM },
c8e1b2fb 97 { "INT", ff_j_rev_dct, MMX_PERM },
e7a972e1 98 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
4b357756 99
17337f54 100#if HAVE_MMX_INLINE
74965f26
MR
101 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
102 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
0b8b2ae5
DB
103#endif
104#if HAVE_MMXEXT_INLINE
d8eda370 105 { "XVID-MMXEXT", ff_idct_xvid_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
0b8b2ae5
DB
106#endif
107#if HAVE_SSE2_INLINE
74965f26 108 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
3ac35bdb
MH
109#endif
110
b250f9c6 111#if ARCH_ARM
74965f26
MR
112 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
113 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
4b357756 114#endif
b250f9c6 115#if HAVE_ARMV5TE
c29d49c1 116 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM, AV_CPU_FLAG_ARMV5TE },
479044ce 117#endif
b250f9c6 118#if HAVE_ARMV6
c29d49c1 119 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM, AV_CPU_FLAG_ARMV6 },
479044ce 120#endif
1e9265cd 121#if HAVE_NEON && ARCH_ARM
c29d49c1 122 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM, AV_CPU_FLAG_NEON },
479044ce 123#endif
479044ce 124
36fa9ef3 125 { 0 }
3ac35bdb
MH
126};
127
/* Shift applied when rescaling the output of SCALE_PERM transforms in dct_error(). */
#define AANSCALE_BITS 12

/* Number of generated blocks checked in each accuracy pass. */
#define NB_ITS 20000
/* Number of transform calls in one batch of the speed test. */
#define NB_ITS_SPEED 50000
132
9e1586fc
FB
/* Destination index per coefficient for MMX_PERM; filled in by idct_mmx_init(). */
static short idct_mmx_perm[64];

/*
 * Destination index per coefficient for MMX_SIMPLE_PERM.
 * The table is only ever read, so it is const like idct_sse2_row_perm.
 */
static const short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Destination column within a row for SSE2_PERM (the row bits are kept). */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
ad246860 147
504ffed1 148static void idct_mmx_init(void)
9e1586fc
FB
149{
150 int i;
151
152 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
153 for (i = 0; i < 64; i++) {
bb270c08 154 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
9e1586fc
FB
155 }
156}
157
88bd7fdc
DB
158DECLARE_ALIGNED(16, static int16_t, block)[64];
159DECLARE_ALIGNED(8, static int16_t, block1)[64];
9e1586fc 160
88bd7fdc 161static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng)
ae2e8971
MR
162{
163 int i, j;
164
165 memset(block, 0, 64 * sizeof(*block));
166
167 switch (test) {
168 case 0:
169 for (i = 0; i < 64; i++)
170 block[i] = (av_lfg_get(prng) % 512) - 256;
171 if (is_idct) {
172 ff_ref_fdct(block);
173 for (i = 0; i < 64; i++)
174 block[i] >>= 3;
175 }
176 break;
177 case 1:
178 j = av_lfg_get(prng) % 10 + 1;
179 for (i = 0; i < j; i++)
180 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
181 break;
182 case 2:
183 block[ 0] = av_lfg_get(prng) % 4096 - 2048;
184 block[63] = (block[0] & 1) ^ 1;
185 break;
186 }
187}
188
88bd7fdc 189static void permute(int16_t dst[64], const int16_t src[64], int perm)
ae2e8971
MR
190{
191 int i;
192
193 if (perm == MMX_PERM) {
194 for (i = 0; i < 64; i++)
195 dst[idct_mmx_perm[i]] = src[i];
196 } else if (perm == MMX_SIMPLE_PERM) {
197 for (i = 0; i < 64; i++)
198 dst[idct_simple_mmx_perm[i]] = src[i];
199 } else if (perm == SSE2_PERM) {
200 for (i = 0; i < 64; i++)
201 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
202 } else if (perm == PARTTRANS_PERM) {
203 for (i = 0; i < 64; i++)
204 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
205 } else {
206 for (i = 0; i < 64; i++)
207 dst[i] = src[i];
208 }
209}
210
dbf396d4 211static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
de6d9b64 212{
88bd7fdc 213 void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
de6d9b64 214 int it, i, scale;
de6d9b64 215 int err_inf, v;
dbf396d4 216 int64_t err2, ti, ti1, it1, err_sum = 0;
36fa9ef3
MR
217 int64_t sysErr[64], sysErrMax = 0;
218 int maxout = 0;
219 int blockSumErrMax = 0, blockSumErr;
64bde197 220 AVLFG prng;
dbf396d4
MR
221 double omse, ome;
222 int spec_err;
de6d9b64 223
64bde197 224 av_lfg_init(&prng, 1);
de6d9b64
FB
225
226 err_inf = 0;
227 err2 = 0;
36fa9ef3
MR
228 for (i = 0; i < 64; i++)
229 sysErr[i] = 0;
230 for (it = 0; it < NB_ITS; it++) {
ae2e8971
MR
231 init_block(block1, test, is_idct, &prng);
232 permute(block, block1, dct->format);
9e1586fc 233
4f905a65 234 dct->func(block);
db7d8fb4 235 emms_c();
9e1586fc 236
4f905a65 237 if (dct->format == SCALE_PERM) {
36fa9ef3
MR
238 for (i = 0; i < 64; i++) {
239 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
240 block[i] = (block[i] * scale) >> AANSCALE_BITS;
86748dbc
MN
241 }
242 }
243
74965f26 244 ref(block1);
de6d9b64 245
36fa9ef3
MR
246 blockSumErr = 0;
247 for (i = 0; i < 64; i++) {
dbf396d4
MR
248 int err = block[i] - block1[i];
249 err_sum += err;
250 v = abs(err);
de6d9b64
FB
251 if (v > err_inf)
252 err_inf = v;
253 err2 += v * v;
bb270c08
DB
254 sysErr[i] += block[i] - block1[i];
255 blockSumErr += v;
36fa9ef3
MR
256 if (abs(block[i]) > maxout)
257 maxout = abs(block[i]);
de6d9b64 258 }
36fa9ef3
MR
259 if (blockSumErrMax < blockSumErr)
260 blockSumErrMax = blockSumErr;
86748dbc 261 }
36fa9ef3
MR
262 for (i = 0; i < 64; i++)
263 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
115329f1 264
36fa9ef3
MR
265 for (i = 0; i < 64; i++) {
266 if (i % 8 == 0)
267 printf("\n");
268 printf("%7d ", (int) sysErr[i]);
de6d9b64 269 }
86748dbc 270 printf("\n");
115329f1 271
dbf396d4
MR
272 omse = (double) err2 / NB_ITS / 64;
273 ome = (double) err_sum / NB_ITS / 64;
274
275 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
276
277 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
cf2b4f88 278 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
dbf396d4 279 omse, ome, (double) sysErrMax / NB_ITS,
36fa9ef3 280 maxout, blockSumErrMax);
e6ff0648 281
dbf396d4
MR
282 if (spec_err && !dct->nonspec)
283 return 1;
284
7fd2c138 285 if (!speed)
dbf396d4 286 return 0;
7fd2c138 287
de6d9b64 288 /* speed test */
ae2e8971
MR
289 init_block(block, test, is_idct, &prng);
290 permute(block1, block, dct->format);
9e1586fc 291
980f81d9 292 ti = av_gettime();
de6d9b64
FB
293 it1 = 0;
294 do {
36fa9ef3 295 for (it = 0; it < NB_ITS_SPEED; it++) {
ae2e8971 296 memcpy(block, block1, sizeof(block));
4f905a65 297 dct->func(block);
de6d9b64
FB
298 }
299 it1 += NB_ITS_SPEED;
980f81d9 300 ti1 = av_gettime() - ti;
de6d9b64 301 } while (ti1 < 1000000);
db7d8fb4 302 emms_c();
de6d9b64 303
cf2b4f88 304 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
36fa9ef3 305 (double) it1 * 1000.0 / (double) ti1);
dbf396d4
MR
306
307 return 0;
de6d9b64
FB
308}
309
c6727809
MR
310DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
311DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
a46a3ce4 312
504ffed1 313static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
a46a3ce4
FB
314{
315 static int init;
316 static double c8[8][8];
317 static double c4[4][4];
318 double block1[64], block2[64], block3[64];
319 double s, sum, v;
320 int i, j, k;
321
322 if (!init) {
323 init = 1;
324
36fa9ef3 325 for (i = 0; i < 8; i++) {
a46a3ce4 326 sum = 0;
36fa9ef3
MR
327 for (j = 0; j < 8; j++) {
328 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
a46a3ce4
FB
329 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
330 sum += c8[i][j] * c8[i][j];
331 }
332 }
115329f1 333
36fa9ef3 334 for (i = 0; i < 4; i++) {
a46a3ce4 335 sum = 0;
36fa9ef3
MR
336 for (j = 0; j < 4; j++) {
337 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
a46a3ce4
FB
338 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
339 sum += c4[i][j] * c4[i][j];
340 }
341 }
342 }
343
344 /* butterfly */
652f0197 345 s = 0.5 * sqrt(2.0);
36fa9ef3
MR
346 for (i = 0; i < 4; i++) {
347 for (j = 0; j < 8; j++) {
348 block1[8 * (2 * i) + j] =
349 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
350 block1[8 * (2 * i + 1) + j] =
351 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
a46a3ce4
FB
352 }
353 }
354
355 /* idct8 on lines */
36fa9ef3
MR
356 for (i = 0; i < 8; i++) {
357 for (j = 0; j < 8; j++) {
a46a3ce4 358 sum = 0;
36fa9ef3
MR
359 for (k = 0; k < 8; k++)
360 sum += c8[k][j] * block1[8 * i + k];
361 block2[8 * i + j] = sum;
a46a3ce4
FB
362 }
363 }
364
365 /* idct4 */
36fa9ef3
MR
366 for (i = 0; i < 8; i++) {
367 for (j = 0; j < 4; j++) {
a46a3ce4
FB
368 /* top */
369 sum = 0;
36fa9ef3
MR
370 for (k = 0; k < 4; k++)
371 sum += c4[k][j] * block2[8 * (2 * k) + i];
372 block3[8 * (2 * j) + i] = sum;
a46a3ce4
FB
373
374 /* bottom */
375 sum = 0;
36fa9ef3
MR
376 for (k = 0; k < 4; k++)
377 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
378 block3[8 * (2 * j + 1) + i] = sum;
a46a3ce4
FB
379 }
380 }
381
382 /* clamp and store the result */
36fa9ef3
MR
383 for (i = 0; i < 8; i++) {
384 for (j = 0; j < 8; j++) {
385 v = block3[8 * i + j];
386 if (v < 0) v = 0;
387 else if (v > 255) v = 255;
388 dest[i * linesize + j] = (int) rint(v);
a46a3ce4
FB
389 }
390 }
391}
392
504ffed1 393static void idct248_error(const char *name,
36fa9ef3 394 void (*idct248_put)(uint8_t *dest, int line_size,
7fd2c138
MR
395 int16_t *block),
396 int speed)
a46a3ce4
FB
397{
398 int it, i, it1, ti, ti1, err_max, v;
64bde197 399 AVLFG prng;
294eaa26 400
64bde197 401 av_lfg_init(&prng, 1);
115329f1 402
a46a3ce4
FB
403 /* just one test to see if code is correct (precision is less
404 important here) */
405 err_max = 0;
36fa9ef3 406 for (it = 0; it < NB_ITS; it++) {
652f0197 407 /* XXX: use forward transform to generate values */
36fa9ef3 408 for (i = 0; i < 64; i++)
64bde197 409 block1[i] = av_lfg_get(&prng) % 256 - 128;
652f0197
FB
410 block1[0] += 1024;
411
36fa9ef3
MR
412 for (i = 0; i < 64; i++)
413 block[i] = block1[i];
a46a3ce4 414 idct248_ref(img_dest1, 8, block);
115329f1 415
36fa9ef3
MR
416 for (i = 0; i < 64; i++)
417 block[i] = block1[i];
652f0197 418 idct248_put(img_dest, 8, block);
115329f1 419
36fa9ef3
MR
420 for (i = 0; i < 64; i++) {
421 v = abs((int) img_dest[i] - (int) img_dest1[i]);
652f0197
FB
422 if (v == 255)
423 printf("%d %d\n", img_dest[i], img_dest1[i]);
424 if (v > err_max)
425 err_max = v;
426 }
a46a3ce4 427 }
36fa9ef3 428 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
a46a3ce4 429
7fd2c138
MR
430 if (!speed)
431 return;
432
980f81d9 433 ti = av_gettime();
a46a3ce4
FB
434 it1 = 0;
435 do {
36fa9ef3
MR
436 for (it = 0; it < NB_ITS_SPEED; it++) {
437 for (i = 0; i < 64; i++)
438 block[i] = block1[i];
a46a3ce4
FB
439 idct248_put(img_dest, 8, block);
440 }
441 it1 += NB_ITS_SPEED;
980f81d9 442 ti1 = av_gettime() - ti;
a46a3ce4 443 } while (ti1 < 1000000);
db7d8fb4 444 emms_c();
a46a3ce4 445
36fa9ef3
MR
446 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
447 (double) it1 * 1000.0 / (double) ti1);
a46a3ce4
FB
448}
449
/**
 * Print the command line summary to standard output.
 */
static void help(void)
{
    static const char usage[] =
        "dct-test [-i] [<test-number>]\n"
        "test-number 0 -> test with random matrixes\n"
        " 1 -> test with random sparse matrixes\n"
        " 2 -> do 3. test from mpeg4 std\n"
        "-i test IDCT implementations\n"
        "-4 test IDCT248 implementations\n"
        "-t speed test\n";

    fputs(usage, stdout);
}
460
667fb97a
RB
461#if !HAVE_GETOPT
462#include "compat/getopt.c"
463#endif
464
de6d9b64
FB
465int main(int argc, char **argv)
466{
a46a3ce4 467 int test_idct = 0, test_248_dct = 0;
36fa9ef3
MR
468 int c, i;
469 int test = 1;
7fd2c138 470 int speed = 0;
dbf396d4 471 int err = 0;
36fa9ef3 472
c6c98d08 473 cpu_flags = av_get_cpu_flags();
9e1586fc 474
0de74546 475 ff_ref_dct_init();
9e1586fc 476 idct_mmx_init();
f67a10cd 477
36fa9ef3 478 for (;;) {
7fd2c138 479 c = getopt(argc, argv, "ih4t");
9e1586fc
FB
480 if (c == -1)
481 break;
36fa9ef3 482 switch (c) {
9e1586fc
FB
483 case 'i':
484 test_idct = 1;
485 break;
a46a3ce4
FB
486 case '4':
487 test_248_dct = 1;
488 break;
7fd2c138
MR
489 case 't':
490 speed = 1;
491 break;
36fa9ef3 492 default:
9e1586fc
FB
493 case 'h':
494 help();
c6bdc908 495 return 0;
9e1586fc
FB
496 }
497 }
115329f1 498
36fa9ef3
MR
499 if (optind < argc)
500 test = atoi(argv[optind]);
115329f1 501
f36b3902 502 printf("Libav DCT/IDCT test\n");
9e1586fc 503
a46a3ce4 504 if (test_248_dct) {
7fd2c138 505 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
9e1586fc 506 } else {
4b357756 507 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
36fa9ef3 508 for (i = 0; algos[i].name; i++)
4b357756 509 if (!(~cpu_flags & algos[i].mm_support)) {
dbf396d4 510 err |= dct_error(&algos[i], test, test_idct, speed);
36fa9ef3 511 }
9e1586fc 512 }
dbf396d4 513
5331d2b9
DB
514 if (err)
515 printf("Error: %d.\n", err);
516
517 return !!err;
de6d9b64 518}