dct-test: use emms_c() from libavutil instead of duplicating it
[libav.git] / libavcodec / dct-test.c
1 /*
2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
4 *
5 * This file is part of Libav.
6 *
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22 /**
23 * @file
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
26 */
27
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <sys/time.h>
32 #include <unistd.h>
33 #include <math.h>
34
35 #include "libavutil/cpu.h"
36 #include "libavutil/common.h"
37 #include "libavutil/lfg.h"
38
39 #include "simple_idct.h"
40 #include "aandcttab.h"
41 #include "faandct.h"
42 #include "faanidct.h"
43 #include "x86/idct_xvid.h"
44 #include "dctref.h"
45
46 #undef printf
47
48 void ff_mmx_idct(DCTELEM *data);
49 void ff_mmxext_idct(DCTELEM *data);
50
51 // BFIN
52 void ff_bfin_idct(DCTELEM *block);
53 void ff_bfin_fdct(DCTELEM *block);
54
55 // ALTIVEC
56 void ff_fdct_altivec(DCTELEM *block);
57 //void ff_idct_altivec(DCTELEM *block);?? no routine
58
59 // ARM
60 void ff_j_rev_dct_arm(DCTELEM *data);
61 void ff_simple_idct_arm(DCTELEM *data);
62 void ff_simple_idct_armv5te(DCTELEM *data);
63 void ff_simple_idct_armv6(DCTELEM *data);
64 void ff_simple_idct_neon(DCTELEM *data);
65
66 void ff_simple_idct_axp(DCTELEM *data);
67
68 struct algo {
69 const char *name;
70 void (*func)(DCTELEM *block);
71 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
72 SSE2_PERM, PARTTRANS_PERM } format;
73 int mm_support;
74 int nonspec;
75 };
76
/* CPU capability bits; main() stores the result of av_get_cpu_flags() here. */
static int cpu_flags = 0;
78
79 static const struct algo fdct_tab[] = {
80 { "REF-DBL", ff_ref_fdct, NO_PERM },
81 { "FAAN", ff_faandct, NO_PERM },
82 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
83 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
84
85 #if HAVE_MMX
86 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
87 { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 },
88 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
89 #endif
90
91 #if HAVE_ALTIVEC
92 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
93 #endif
94
95 #if ARCH_BFIN
96 { "BFINfdct", ff_bfin_fdct, NO_PERM },
97 #endif
98
99 { 0 }
100 };
101
102 static const struct algo idct_tab[] = {
103 { "FAANI", ff_faanidct, NO_PERM },
104 { "REF-DBL", ff_ref_idct, NO_PERM },
105 { "INT", ff_j_rev_dct, MMX_PERM },
106 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
107
108 #if HAVE_MMX
109 #if CONFIG_GPL
110 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
111 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
112 #endif
113 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
114 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
115 { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
116 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
117 #endif
118
119 #if ARCH_BFIN
120 { "BFINidct", ff_bfin_idct, NO_PERM },
121 #endif
122
123 #if ARCH_ARM
124 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
125 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
126 #endif
127 #if HAVE_ARMV5TE
128 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
129 #endif
130 #if HAVE_ARMV6
131 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
132 #endif
133 #if HAVE_NEON
134 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
135 #endif
136
137 #if ARCH_ALPHA
138 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
139 #endif
140
141 { 0 }
142 };
143
/* Fixed-point shift used by dct_error() when rescaling SCALE_PERM output. */
#define AANSCALE_BITS 12
145
/**
 * Read the clock through gettimeofday().
 *
 * @return seconds * 1000000 + microseconds, as one 64-bit value
 */
static int64_t gettime(void)
{
    struct timeval now;
    int64_t usec;

    gettimeofday(&now, NULL);

    /* widen before scaling so the multiplication is done in 64 bits */
    usec  = now.tv_sec;
    usec *= 1000000;
    usec += now.tv_usec;

    return usec;
}
152
/* Blocks processed per accuracy run (NB_ITS) and per timing batch (NB_ITS_SPEED). */
#define NB_ITS 20000
#define NB_ITS_SPEED 50000

/* Destination index of each coefficient for MMX_PERM; filled by idct_mmx_init(). */
static short idct_mmx_perm[64];

/*
 * Destination index of each coefficient for MMX_SIMPLE_PERM: permute() stores
 * source coefficient i at position idct_simple_mmx_perm[i].
 * The table is only ever read, hence const.
 */
static const short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Destination column, within a row of eight, of each source column for SSE2_PERM. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
170
171 static void idct_mmx_init(void)
172 {
173 int i;
174
175 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
176 for (i = 0; i < 64; i++) {
177 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
178 }
179 }
180
181 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
182 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
183
184 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng)
185 {
186 int i, j;
187
188 memset(block, 0, 64 * sizeof(*block));
189
190 switch (test) {
191 case 0:
192 for (i = 0; i < 64; i++)
193 block[i] = (av_lfg_get(prng) % 512) - 256;
194 if (is_idct) {
195 ff_ref_fdct(block);
196 for (i = 0; i < 64; i++)
197 block[i] >>= 3;
198 }
199 break;
200 case 1:
201 j = av_lfg_get(prng) % 10 + 1;
202 for (i = 0; i < j; i++)
203 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
204 break;
205 case 2:
206 block[ 0] = av_lfg_get(prng) % 4096 - 2048;
207 block[63] = (block[0] & 1) ^ 1;
208 break;
209 }
210 }
211
212 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
213 {
214 int i;
215
216 if (perm == MMX_PERM) {
217 for (i = 0; i < 64; i++)
218 dst[idct_mmx_perm[i]] = src[i];
219 } else if (perm == MMX_SIMPLE_PERM) {
220 for (i = 0; i < 64; i++)
221 dst[idct_simple_mmx_perm[i]] = src[i];
222 } else if (perm == SSE2_PERM) {
223 for (i = 0; i < 64; i++)
224 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
225 } else if (perm == PARTTRANS_PERM) {
226 for (i = 0; i < 64; i++)
227 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
228 } else {
229 for (i = 0; i < 64; i++)
230 dst[i] = src[i];
231 }
232 }
233
234 static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
235 {
236 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
237 int it, i, scale;
238 int err_inf, v;
239 int64_t err2, ti, ti1, it1, err_sum = 0;
240 int64_t sysErr[64], sysErrMax = 0;
241 int maxout = 0;
242 int blockSumErrMax = 0, blockSumErr;
243 AVLFG prng;
244 double omse, ome;
245 int spec_err;
246
247 av_lfg_init(&prng, 1);
248
249 err_inf = 0;
250 err2 = 0;
251 for (i = 0; i < 64; i++)
252 sysErr[i] = 0;
253 for (it = 0; it < NB_ITS; it++) {
254 init_block(block1, test, is_idct, &prng);
255 permute(block, block1, dct->format);
256
257 dct->func(block);
258 emms_c();
259
260 if (dct->format == SCALE_PERM) {
261 for (i = 0; i < 64; i++) {
262 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
263 block[i] = (block[i] * scale) >> AANSCALE_BITS;
264 }
265 }
266
267 ref(block1);
268
269 blockSumErr = 0;
270 for (i = 0; i < 64; i++) {
271 int err = block[i] - block1[i];
272 err_sum += err;
273 v = abs(err);
274 if (v > err_inf)
275 err_inf = v;
276 err2 += v * v;
277 sysErr[i] += block[i] - block1[i];
278 blockSumErr += v;
279 if (abs(block[i]) > maxout)
280 maxout = abs(block[i]);
281 }
282 if (blockSumErrMax < blockSumErr)
283 blockSumErrMax = blockSumErr;
284 }
285 for (i = 0; i < 64; i++)
286 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
287
288 for (i = 0; i < 64; i++) {
289 if (i % 8 == 0)
290 printf("\n");
291 printf("%7d ", (int) sysErr[i]);
292 }
293 printf("\n");
294
295 omse = (double) err2 / NB_ITS / 64;
296 ome = (double) err_sum / NB_ITS / 64;
297
298 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
299
300 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
301 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
302 omse, ome, (double) sysErrMax / NB_ITS,
303 maxout, blockSumErrMax);
304
305 if (spec_err && !dct->nonspec)
306 return 1;
307
308 if (!speed)
309 return 0;
310
311 /* speed test */
312 init_block(block, test, is_idct, &prng);
313 permute(block1, block, dct->format);
314
315 ti = gettime();
316 it1 = 0;
317 do {
318 for (it = 0; it < NB_ITS_SPEED; it++) {
319 memcpy(block, block1, sizeof(block));
320 dct->func(block);
321 }
322 it1 += NB_ITS_SPEED;
323 ti1 = gettime() - ti;
324 } while (ti1 < 1000000);
325 emms_c();
326
327 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
328 (double) it1 * 1000.0 / (double) ti1);
329
330 return 0;
331 }
332
333 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
334 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
335
336 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
337 {
338 static int init;
339 static double c8[8][8];
340 static double c4[4][4];
341 double block1[64], block2[64], block3[64];
342 double s, sum, v;
343 int i, j, k;
344
345 if (!init) {
346 init = 1;
347
348 for (i = 0; i < 8; i++) {
349 sum = 0;
350 for (j = 0; j < 8; j++) {
351 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
352 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
353 sum += c8[i][j] * c8[i][j];
354 }
355 }
356
357 for (i = 0; i < 4; i++) {
358 sum = 0;
359 for (j = 0; j < 4; j++) {
360 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
361 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
362 sum += c4[i][j] * c4[i][j];
363 }
364 }
365 }
366
367 /* butterfly */
368 s = 0.5 * sqrt(2.0);
369 for (i = 0; i < 4; i++) {
370 for (j = 0; j < 8; j++) {
371 block1[8 * (2 * i) + j] =
372 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
373 block1[8 * (2 * i + 1) + j] =
374 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
375 }
376 }
377
378 /* idct8 on lines */
379 for (i = 0; i < 8; i++) {
380 for (j = 0; j < 8; j++) {
381 sum = 0;
382 for (k = 0; k < 8; k++)
383 sum += c8[k][j] * block1[8 * i + k];
384 block2[8 * i + j] = sum;
385 }
386 }
387
388 /* idct4 */
389 for (i = 0; i < 8; i++) {
390 for (j = 0; j < 4; j++) {
391 /* top */
392 sum = 0;
393 for (k = 0; k < 4; k++)
394 sum += c4[k][j] * block2[8 * (2 * k) + i];
395 block3[8 * (2 * j) + i] = sum;
396
397 /* bottom */
398 sum = 0;
399 for (k = 0; k < 4; k++)
400 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
401 block3[8 * (2 * j + 1) + i] = sum;
402 }
403 }
404
405 /* clamp and store the result */
406 for (i = 0; i < 8; i++) {
407 for (j = 0; j < 8; j++) {
408 v = block3[8 * i + j];
409 if (v < 0) v = 0;
410 else if (v > 255) v = 255;
411 dest[i * linesize + j] = (int) rint(v);
412 }
413 }
414 }
415
416 static void idct248_error(const char *name,
417 void (*idct248_put)(uint8_t *dest, int line_size,
418 int16_t *block),
419 int speed)
420 {
421 int it, i, it1, ti, ti1, err_max, v;
422 AVLFG prng;
423
424 av_lfg_init(&prng, 1);
425
426 /* just one test to see if code is correct (precision is less
427 important here) */
428 err_max = 0;
429 for (it = 0; it < NB_ITS; it++) {
430 /* XXX: use forward transform to generate values */
431 for (i = 0; i < 64; i++)
432 block1[i] = av_lfg_get(&prng) % 256 - 128;
433 block1[0] += 1024;
434
435 for (i = 0; i < 64; i++)
436 block[i] = block1[i];
437 idct248_ref(img_dest1, 8, block);
438
439 for (i = 0; i < 64; i++)
440 block[i] = block1[i];
441 idct248_put(img_dest, 8, block);
442
443 for (i = 0; i < 64; i++) {
444 v = abs((int) img_dest[i] - (int) img_dest1[i]);
445 if (v == 255)
446 printf("%d %d\n", img_dest[i], img_dest1[i]);
447 if (v > err_max)
448 err_max = v;
449 }
450 }
451 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
452
453 if (!speed)
454 return;
455
456 ti = gettime();
457 it1 = 0;
458 do {
459 for (it = 0; it < NB_ITS_SPEED; it++) {
460 for (i = 0; i < 64; i++)
461 block[i] = block1[i];
462 idct248_put(img_dest, 8, block);
463 }
464 it1 += NB_ITS_SPEED;
465 ti1 = gettime() - ti;
466 } while (ti1 < 1000000);
467 emms_c();
468
469 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
470 (double) it1 * 1000.0 / (double) ti1);
471 }
472
/** Print the command line summary to standard output. */
static void help(void)
{
    static const char usage[] =
        "dct-test [-i] [<test-number>]\n"
        "test-number 0 -> test with random matrixes\n"
        " 1 -> test with random sparse matrixes\n"
        " 2 -> do 3. test from mpeg4 std\n"
        "-i test IDCT implementations\n"
        "-4 test IDCT248 implementations\n"
        "-t speed test\n";

    printf("%s", usage);
}
483
484 int main(int argc, char **argv)
485 {
486 int test_idct = 0, test_248_dct = 0;
487 int c, i;
488 int test = 1;
489 int speed = 0;
490 int err = 0;
491
492 cpu_flags = av_get_cpu_flags();
493
494 ff_ref_dct_init();
495 idct_mmx_init();
496
497 for (;;) {
498 c = getopt(argc, argv, "ih4t");
499 if (c == -1)
500 break;
501 switch (c) {
502 case 'i':
503 test_idct = 1;
504 break;
505 case '4':
506 test_248_dct = 1;
507 break;
508 case 't':
509 speed = 1;
510 break;
511 default:
512 case 'h':
513 help();
514 return 0;
515 }
516 }
517
518 if (optind < argc)
519 test = atoi(argv[optind]);
520
521 printf("Libav DCT/IDCT test\n");
522
523 if (test_248_dct) {
524 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
525 } else {
526 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
527 for (i = 0; algos[i].name; i++)
528 if (!(~cpu_flags & algos[i].mm_support)) {
529 err |= dct_error(&algos[i], test, test_idct, speed);
530 }
531 }
532
533 return err;
534 }