4f0a0c69964f2152adab7c3c3534d0ca5dc5abb1
[libav.git] / libavcodec / dct-test.c
1 /*
2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22 /**
23 * @file
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
26 */
27
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <sys/time.h>
32 #include <unistd.h>
33 #include <math.h>
34
35 #include "libavutil/common.h"
36 #include "libavutil/lfg.h"
37
38 #include "simple_idct.h"
39 #include "aandcttab.h"
40 #include "faandct.h"
41 #include "faanidct.h"
42 #include "x86/idct_xvid.h"
43 #include "dctref.h"
44
45 #undef printf
46
47 void ff_mmx_idct(DCTELEM *data);
48 void ff_mmxext_idct(DCTELEM *data);
49
50 void odivx_idct_c(short *block);
51
52 // BFIN
53 void ff_bfin_idct(DCTELEM *block);
54 void ff_bfin_fdct(DCTELEM *block);
55
56 // ALTIVEC
57 void fdct_altivec(DCTELEM *block);
58 //void idct_altivec(DCTELEM *block);?? no routine
59
60 // ARM
61 void ff_j_rev_dct_arm(DCTELEM *data);
62 void ff_simple_idct_arm(DCTELEM *data);
63 void ff_simple_idct_armv5te(DCTELEM *data);
64 void ff_simple_idct_armv6(DCTELEM *data);
65 void ff_simple_idct_neon(DCTELEM *data);
66
67 void ff_simple_idct_axp(DCTELEM *data);
68
69 struct algo {
70 const char *name;
71 enum { FDCT, IDCT } is_idct;
72 void (* func) (DCTELEM *block);
73 void (* ref) (DCTELEM *block);
74 enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM, PARTTRANS_PERM } format;
75 int mm_support;
76 };
77
78 #ifndef FAAN_POSTSCALE
79 #define FAAN_SCALE SCALE_PERM
80 #else
81 #define FAAN_SCALE NO_PERM
82 #endif
83
84 static int cpu_flags;
85
86 struct algo algos[] = {
87 {"REF-DBL", 0, ff_ref_fdct, ff_ref_fdct, NO_PERM},
88 {"FAAN", 0, ff_faandct, ff_ref_fdct, FAAN_SCALE},
89 {"FAANI", 1, ff_faanidct, ff_ref_idct, NO_PERM},
90 {"IJG-AAN-INT", 0, fdct_ifast, ff_ref_fdct, SCALE_PERM},
91 {"IJG-LLM-INT", 0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM},
92 {"REF-DBL", 1, ff_ref_idct, ff_ref_idct, NO_PERM},
93 {"INT", 1, j_rev_dct, ff_ref_idct, MMX_PERM},
94 {"SIMPLE-C", 1, ff_simple_idct, ff_ref_idct, NO_PERM},
95
96 #if HAVE_MMX
97 {"MMX", 0, ff_fdct_mmx, ff_ref_fdct, NO_PERM, FF_MM_MMX},
98 #if HAVE_MMX2
99 {"MMX2", 0, ff_fdct_mmx2, ff_ref_fdct, NO_PERM, FF_MM_MMX2},
100 {"SSE2", 0, ff_fdct_sse2, ff_ref_fdct, NO_PERM, FF_MM_SSE2},
101 #endif
102
103 #if CONFIG_GPL
104 {"LIBMPEG2-MMX", 1, ff_mmx_idct, ff_ref_idct, MMX_PERM, FF_MM_MMX},
105 {"LIBMPEG2-MMX2", 1, ff_mmxext_idct, ff_ref_idct, MMX_PERM, FF_MM_MMX2},
106 #endif
107 {"SIMPLE-MMX", 1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, FF_MM_MMX},
108 {"XVID-MMX", 1, ff_idct_xvid_mmx, ff_ref_idct, NO_PERM, FF_MM_MMX},
109 {"XVID-MMX2", 1, ff_idct_xvid_mmx2, ff_ref_idct, NO_PERM, FF_MM_MMX2},
110 {"XVID-SSE2", 1, ff_idct_xvid_sse2, ff_ref_idct, SSE2_PERM, FF_MM_SSE2},
111 #endif
112
113 #if HAVE_ALTIVEC
114 {"altivecfdct", 0, fdct_altivec, ff_ref_fdct, NO_PERM, FF_MM_ALTIVEC},
115 #endif
116
117 #if ARCH_BFIN
118 {"BFINfdct", 0, ff_bfin_fdct, ff_ref_fdct, NO_PERM},
119 {"BFINidct", 1, ff_bfin_idct, ff_ref_idct, NO_PERM},
120 #endif
121
122 #if ARCH_ARM
123 {"SIMPLE-ARM", 1, ff_simple_idct_arm, ff_ref_idct, NO_PERM },
124 {"INT-ARM", 1, ff_j_rev_dct_arm, ff_ref_idct, MMX_PERM },
125 #if HAVE_ARMV5TE
126 {"SIMPLE-ARMV5TE", 1, ff_simple_idct_armv5te, ff_ref_idct, NO_PERM },
127 #endif
128 #if HAVE_ARMV6
129 {"SIMPLE-ARMV6", 1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM },
130 #endif
131 #if HAVE_NEON
132 {"SIMPLE-NEON", 1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM },
133 #endif
134 #endif /* ARCH_ARM */
135
136 #if ARCH_ALPHA
137 {"SIMPLE-ALPHA", 1, ff_simple_idct_axp, ff_ref_idct, NO_PERM },
138 #endif
139
140 { 0 }
141 };
142
143 #define AANSCALE_BITS 12
144
145 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
146
147 static int64_t gettime(void)
148 {
149 struct timeval tv;
150 gettimeofday(&tv,NULL);
151 return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
152 }
153
154 #define NB_ITS 20000
155 #define NB_ITS_SPEED 50000
156
157 static short idct_mmx_perm[64];
158
159 static short idct_simple_mmx_perm[64]={
160 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
161 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
162 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
163 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
164 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
165 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
166 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
167 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
168 };
169
170 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
171
172 static void idct_mmx_init(void)
173 {
174 int i;
175
176 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
177 for (i = 0; i < 64; i++) {
178 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
179 // idct_simple_mmx_perm[i] = simple_block_permute_op(i);
180 }
181 }
182
183 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
184 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
185 DECLARE_ALIGNED(8, static DCTELEM, block_org)[64];
186
187 static inline void mmx_emms(void)
188 {
189 #if HAVE_MMX
190 if (cpu_flags & FF_MM_MMX)
191 __asm__ volatile ("emms\n\t");
192 #endif
193 }
194
195 static void dct_error(const char *name, int is_idct,
196 void (*fdct_func)(DCTELEM *block),
197 void (*fdct_ref)(DCTELEM *block), int form, int test)
198 {
199 int it, i, scale;
200 int err_inf, v;
201 int64_t err2, ti, ti1, it1;
202 int64_t sysErr[64], sysErrMax=0;
203 int maxout=0;
204 int blockSumErrMax=0, blockSumErr;
205 AVLFG prng;
206
207 av_lfg_init(&prng, 1);
208
209 err_inf = 0;
210 err2 = 0;
211 for(i=0; i<64; i++) sysErr[i]=0;
212 for(it=0;it<NB_ITS;it++) {
213 for(i=0;i<64;i++)
214 block1[i] = 0;
215 switch(test){
216 case 0:
217 for(i=0;i<64;i++)
218 block1[i] = (av_lfg_get(&prng) % 512) -256;
219 if (is_idct){
220 ff_ref_fdct(block1);
221
222 for(i=0;i<64;i++)
223 block1[i]>>=3;
224 }
225 break;
226 case 1:{
227 int num = av_lfg_get(&prng) % 10 + 1;
228 for(i=0;i<num;i++)
229 block1[av_lfg_get(&prng) % 64] = av_lfg_get(&prng) % 512 -256;
230 }break;
231 case 2:
232 block1[0] = av_lfg_get(&prng) % 4096 - 2048;
233 block1[63]= (block1[0]&1)^1;
234 break;
235 }
236
237 #if 0 // simulate mismatch control
238 { int sum=0;
239 for(i=0;i<64;i++)
240 sum+=block1[i];
241
242 if((sum&1)==0) block1[63]^=1;
243 }
244 #endif
245
246 for(i=0; i<64; i++)
247 block_org[i]= block1[i];
248
249 if (form == MMX_PERM) {
250 for(i=0;i<64;i++)
251 block[idct_mmx_perm[i]] = block1[i];
252 } else if (form == MMX_SIMPLE_PERM) {
253 for(i=0;i<64;i++)
254 block[idct_simple_mmx_perm[i]] = block1[i];
255
256 } else if (form == SSE2_PERM) {
257 for(i=0; i<64; i++)
258 block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i];
259 } else if (form == PARTTRANS_PERM) {
260 for(i=0; i<64; i++)
261 block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i];
262 } else {
263 for(i=0; i<64; i++)
264 block[i]= block1[i];
265 }
266 #if 0 // simulate mismatch control for tested IDCT but not the ref
267 { int sum=0;
268 for(i=0;i<64;i++)
269 sum+=block[i];
270
271 if((sum&1)==0) block[63]^=1;
272 }
273 #endif
274
275 fdct_func(block);
276 mmx_emms();
277
278 if (form == SCALE_PERM) {
279 for(i=0; i<64; i++) {
280 scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
281 block[i] = (block[i] * scale /*+ (1<<(AANSCALE_BITS-1))*/) >> AANSCALE_BITS;
282 }
283 }
284
285 fdct_ref(block1);
286
287 blockSumErr=0;
288 for(i=0;i<64;i++) {
289 v = abs(block[i] - block1[i]);
290 if (v > err_inf)
291 err_inf = v;
292 err2 += v * v;
293 sysErr[i] += block[i] - block1[i];
294 blockSumErr += v;
295 if( abs(block[i])>maxout) maxout=abs(block[i]);
296 }
297 if(blockSumErrMax < blockSumErr) blockSumErrMax= blockSumErr;
298 #if 0 // print different matrix pairs
299 if(blockSumErr){
300 printf("\n");
301 for(i=0; i<64; i++){
302 if((i&7)==0) printf("\n");
303 printf("%4d ", block_org[i]);
304 }
305 for(i=0; i<64; i++){
306 if((i&7)==0) printf("\n");
307 printf("%4d ", block[i] - block1[i]);
308 }
309 }
310 #endif
311 }
312 for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i]));
313
314 #if 1 // dump systematic errors
315 for(i=0; i<64; i++){
316 if(i%8==0) printf("\n");
317 printf("%7d ", (int)sysErr[i]);
318 }
319 printf("\n");
320 #endif
321
322 printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
323 is_idct ? "IDCT" : "DCT",
324 name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax);
325 #if 1 //Speed test
326 /* speed test */
327 for(i=0;i<64;i++)
328 block1[i] = 0;
329 switch(test){
330 case 0:
331 for(i=0;i<64;i++)
332 block1[i] = av_lfg_get(&prng) % 512 -256;
333 if (is_idct){
334 ff_ref_fdct(block1);
335
336 for(i=0;i<64;i++)
337 block1[i]>>=3;
338 }
339 break;
340 case 1:{
341 case 2:
342 block1[0] = av_lfg_get(&prng) % 512 -256;
343 block1[1] = av_lfg_get(&prng) % 512 -256;
344 block1[2] = av_lfg_get(&prng) % 512 -256;
345 block1[3] = av_lfg_get(&prng) % 512 -256;
346 }break;
347 }
348
349 if (form == MMX_PERM) {
350 for(i=0;i<64;i++)
351 block[idct_mmx_perm[i]] = block1[i];
352 } else if(form == MMX_SIMPLE_PERM) {
353 for(i=0;i<64;i++)
354 block[idct_simple_mmx_perm[i]] = block1[i];
355 } else {
356 for(i=0; i<64; i++)
357 block[i]= block1[i];
358 }
359
360 ti = gettime();
361 it1 = 0;
362 do {
363 for(it=0;it<NB_ITS_SPEED;it++) {
364 for(i=0; i<64; i++)
365 block[i]= block1[i];
366 // memcpy(block, block1, sizeof(DCTELEM) * 64);
367 // do not memcpy especially not fastmemcpy because it does movntq !!!
368 fdct_func(block);
369 }
370 it1 += NB_ITS_SPEED;
371 ti1 = gettime() - ti;
372 } while (ti1 < 1000000);
373 mmx_emms();
374
375 printf("%s %s: %0.1f kdct/s\n",
376 is_idct ? "IDCT" : "DCT",
377 name, (double)it1 * 1000.0 / (double)ti1);
378 #endif
379 }
380
381 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
382 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
383
384 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
385 {
386 static int init;
387 static double c8[8][8];
388 static double c4[4][4];
389 double block1[64], block2[64], block3[64];
390 double s, sum, v;
391 int i, j, k;
392
393 if (!init) {
394 init = 1;
395
396 for(i=0;i<8;i++) {
397 sum = 0;
398 for(j=0;j<8;j++) {
399 s = (i==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0);
400 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
401 sum += c8[i][j] * c8[i][j];
402 }
403 }
404
405 for(i=0;i<4;i++) {
406 sum = 0;
407 for(j=0;j<4;j++) {
408 s = (i==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0);
409 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
410 sum += c4[i][j] * c4[i][j];
411 }
412 }
413 }
414
415 /* butterfly */
416 s = 0.5 * sqrt(2.0);
417 for(i=0;i<4;i++) {
418 for(j=0;j<8;j++) {
419 block1[8*(2*i)+j] = (block[8*(2*i)+j] + block[8*(2*i+1)+j]) * s;
420 block1[8*(2*i+1)+j] = (block[8*(2*i)+j] - block[8*(2*i+1)+j]) * s;
421 }
422 }
423
424 /* idct8 on lines */
425 for(i=0;i<8;i++) {
426 for(j=0;j<8;j++) {
427 sum = 0;
428 for(k=0;k<8;k++)
429 sum += c8[k][j] * block1[8*i+k];
430 block2[8*i+j] = sum;
431 }
432 }
433
434 /* idct4 */
435 for(i=0;i<8;i++) {
436 for(j=0;j<4;j++) {
437 /* top */
438 sum = 0;
439 for(k=0;k<4;k++)
440 sum += c4[k][j] * block2[8*(2*k)+i];
441 block3[8*(2*j)+i] = sum;
442
443 /* bottom */
444 sum = 0;
445 for(k=0;k<4;k++)
446 sum += c4[k][j] * block2[8*(2*k+1)+i];
447 block3[8*(2*j+1)+i] = sum;
448 }
449 }
450
451 /* clamp and store the result */
452 for(i=0;i<8;i++) {
453 for(j=0;j<8;j++) {
454 v = block3[8*i+j];
455 if (v < 0)
456 v = 0;
457 else if (v > 255)
458 v = 255;
459 dest[i * linesize + j] = (int)rint(v);
460 }
461 }
462 }
463
464 static void idct248_error(const char *name,
465 void (*idct248_put)(uint8_t *dest, int line_size, int16_t *block))
466 {
467 int it, i, it1, ti, ti1, err_max, v;
468
469 AVLFG prng;
470
471 av_lfg_init(&prng, 1);
472
473 /* just one test to see if code is correct (precision is less
474 important here) */
475 err_max = 0;
476 for(it=0;it<NB_ITS;it++) {
477
478 /* XXX: use forward transform to generate values */
479 for(i=0;i<64;i++)
480 block1[i] = av_lfg_get(&prng) % 256 - 128;
481 block1[0] += 1024;
482
483 for(i=0; i<64; i++)
484 block[i]= block1[i];
485 idct248_ref(img_dest1, 8, block);
486
487 for(i=0; i<64; i++)
488 block[i]= block1[i];
489 idct248_put(img_dest, 8, block);
490
491 for(i=0;i<64;i++) {
492 v = abs((int)img_dest[i] - (int)img_dest1[i]);
493 if (v == 255)
494 printf("%d %d\n", img_dest[i], img_dest1[i]);
495 if (v > err_max)
496 err_max = v;
497 }
498 #if 0
499 printf("ref=\n");
500 for(i=0;i<8;i++) {
501 int j;
502 for(j=0;j<8;j++) {
503 printf(" %3d", img_dest1[i*8+j]);
504 }
505 printf("\n");
506 }
507
508 printf("out=\n");
509 for(i=0;i<8;i++) {
510 int j;
511 for(j=0;j<8;j++) {
512 printf(" %3d", img_dest[i*8+j]);
513 }
514 printf("\n");
515 }
516 #endif
517 }
518 printf("%s %s: err_inf=%d\n",
519 1 ? "IDCT248" : "DCT248",
520 name, err_max);
521
522 ti = gettime();
523 it1 = 0;
524 do {
525 for(it=0;it<NB_ITS_SPEED;it++) {
526 for(i=0; i<64; i++)
527 block[i]= block1[i];
528 // memcpy(block, block1, sizeof(DCTELEM) * 64);
529 // do not memcpy especially not fastmemcpy because it does movntq !!!
530 idct248_put(img_dest, 8, block);
531 }
532 it1 += NB_ITS_SPEED;
533 ti1 = gettime() - ti;
534 } while (ti1 < 1000000);
535 mmx_emms();
536
537 printf("%s %s: %0.1f kdct/s\n",
538 1 ? "IDCT248" : "DCT248",
539 name, (double)it1 * 1000.0 / (double)ti1);
540 }
541
542 static void help(void)
543 {
544 printf("dct-test [-i] [<test-number>]\n"
545 "test-number 0 -> test with random matrixes\n"
546 " 1 -> test with random sparse matrixes\n"
547 " 2 -> do 3. test from mpeg4 std\n"
548 "-i test IDCT implementations\n"
549 "-4 test IDCT248 implementations\n");
550 }
551
552 int main(int argc, char **argv)
553 {
554 int test_idct = 0, test_248_dct = 0;
555 int c,i;
556 int test=1;
557 cpu_flags = mm_support();
558
559 ff_ref_dct_init();
560 idct_mmx_init();
561
562 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
563 for(i=0;i<MAX_NEG_CROP;i++) {
564 cropTbl[i] = 0;
565 cropTbl[i + MAX_NEG_CROP + 256] = 255;
566 }
567
568 for(;;) {
569 c = getopt(argc, argv, "ih4");
570 if (c == -1)
571 break;
572 switch(c) {
573 case 'i':
574 test_idct = 1;
575 break;
576 case '4':
577 test_248_dct = 1;
578 break;
579 default :
580 case 'h':
581 help();
582 return 0;
583 }
584 }
585
586 if(optind <argc) test= atoi(argv[optind]);
587
588 printf("ffmpeg DCT/IDCT test\n");
589
590 if (test_248_dct) {
591 idct248_error("SIMPLE-C", ff_simple_idct248_put);
592 } else {
593 for (i=0;algos[i].name;i++)
594 if (algos[i].is_idct == test_idct && !(~cpu_flags & algos[i].mm_support)) {
595 dct_error (algos[i].name, algos[i].is_idct, algos[i].func, algos[i].ref, algos[i].format, test);
596 }
597 }
598 return 0;
599 }