d0fe34e1d7af865ae5409cf3f7eea288e9dd6078
[libav.git] / libavcodec / dct-test.c
1 /*
2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
4 *
5 * This file is part of Libav.
6 *
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22 /**
23 * @file
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
26 */
27
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <sys/time.h>
32 #include <unistd.h>
33 #include <math.h>
34
35 #include "libavutil/cpu.h"
36 #include "libavutil/common.h"
37 #include "libavutil/lfg.h"
38
39 #include "simple_idct.h"
40 #include "aandcttab.h"
41 #include "faandct.h"
42 #include "faanidct.h"
43 #include "x86/idct_xvid.h"
44 #include "dctref.h"
45
46 #undef printf
47
48 void ff_mmx_idct(DCTELEM *data);
49 void ff_mmxext_idct(DCTELEM *data);
50
51 void odivx_idct_c(short *block);
52
53 // BFIN
54 void ff_bfin_idct(DCTELEM *block);
55 void ff_bfin_fdct(DCTELEM *block);
56
57 // ALTIVEC
58 void fdct_altivec(DCTELEM *block);
59 //void idct_altivec(DCTELEM *block);?? no routine
60
61 // ARM
62 void ff_j_rev_dct_arm(DCTELEM *data);
63 void ff_simple_idct_arm(DCTELEM *data);
64 void ff_simple_idct_armv5te(DCTELEM *data);
65 void ff_simple_idct_armv6(DCTELEM *data);
66 void ff_simple_idct_neon(DCTELEM *data);
67
68 void ff_simple_idct_axp(DCTELEM *data);
69
70 struct algo {
71 const char *name;
72 enum { FDCT, IDCT } is_idct;
73 void (* func) (DCTELEM *block);
74 void (* ref) (DCTELEM *block);
75 enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM, PARTTRANS_PERM } format;
76 int mm_support;
77 };
78
79 #ifndef FAAN_POSTSCALE
80 #define FAAN_SCALE SCALE_PERM
81 #else
82 #define FAAN_SCALE NO_PERM
83 #endif
84
85 static int cpu_flags;
86
87 struct algo algos[] = {
88 {"REF-DBL", 0, ff_ref_fdct, ff_ref_fdct, NO_PERM},
89 {"FAAN", 0, ff_faandct, ff_ref_fdct, FAAN_SCALE},
90 {"FAANI", 1, ff_faanidct, ff_ref_idct, NO_PERM},
91 {"IJG-AAN-INT", 0, fdct_ifast, ff_ref_fdct, SCALE_PERM},
92 {"IJG-LLM-INT", 0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM},
93 {"REF-DBL", 1, ff_ref_idct, ff_ref_idct, NO_PERM},
94 {"INT", 1, j_rev_dct, ff_ref_idct, MMX_PERM},
95 {"SIMPLE-C", 1, ff_simple_idct, ff_ref_idct, NO_PERM},
96
97 #if HAVE_MMX
98 {"MMX", 0, ff_fdct_mmx, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX},
99 #if HAVE_MMX2
100 {"MMX2", 0, ff_fdct_mmx2, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX2},
101 {"SSE2", 0, ff_fdct_sse2, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_SSE2},
102 #endif
103
104 #if CONFIG_GPL
105 {"LIBMPEG2-MMX", 1, ff_mmx_idct, ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX},
106 {"LIBMPEG2-MMX2", 1, ff_mmxext_idct, ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX2},
107 #endif
108 {"SIMPLE-MMX", 1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX},
109 {"XVID-MMX", 1, ff_idct_xvid_mmx, ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX},
110 {"XVID-MMX2", 1, ff_idct_xvid_mmx2, ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX2},
111 {"XVID-SSE2", 1, ff_idct_xvid_sse2, ff_ref_idct, SSE2_PERM, AV_CPU_FLAG_SSE2},
112 #endif
113
114 #if HAVE_ALTIVEC
115 {"altivecfdct", 0, fdct_altivec, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_ALTIVEC},
116 #endif
117
118 #if ARCH_BFIN
119 {"BFINfdct", 0, ff_bfin_fdct, ff_ref_fdct, NO_PERM},
120 {"BFINidct", 1, ff_bfin_idct, ff_ref_idct, NO_PERM},
121 #endif
122
123 #if ARCH_ARM
124 {"SIMPLE-ARM", 1, ff_simple_idct_arm, ff_ref_idct, NO_PERM },
125 {"INT-ARM", 1, ff_j_rev_dct_arm, ff_ref_idct, MMX_PERM },
126 #if HAVE_ARMV5TE
127 {"SIMPLE-ARMV5TE", 1, ff_simple_idct_armv5te, ff_ref_idct, NO_PERM },
128 #endif
129 #if HAVE_ARMV6
130 {"SIMPLE-ARMV6", 1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM },
131 #endif
132 #if HAVE_NEON
133 {"SIMPLE-NEON", 1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM },
134 #endif
135 #endif /* ARCH_ARM */
136
137 #if ARCH_ALPHA
138 {"SIMPLE-ALPHA", 1, ff_simple_idct_axp, ff_ref_idct, NO_PERM },
139 #endif
140
141 { 0 }
142 };
143
144 #define AANSCALE_BITS 12
145
146 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
147
148 static int64_t gettime(void)
149 {
150 struct timeval tv;
151 gettimeofday(&tv,NULL);
152 return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
153 }
154
155 #define NB_ITS 20000
156 #define NB_ITS_SPEED 50000
157
158 static short idct_mmx_perm[64];
159
160 static short idct_simple_mmx_perm[64]={
161 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
162 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
163 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
164 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
165 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
166 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
167 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
168 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
169 };
170
171 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
172
173 static void idct_mmx_init(void)
174 {
175 int i;
176
177 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
178 for (i = 0; i < 64; i++) {
179 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
180 // idct_simple_mmx_perm[i] = simple_block_permute_op(i);
181 }
182 }
183
184 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
185 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
186 DECLARE_ALIGNED(8, static DCTELEM, block_org)[64];
187
188 static inline void mmx_emms(void)
189 {
190 #if HAVE_MMX
191 if (cpu_flags & AV_CPU_FLAG_MMX)
192 __asm__ volatile ("emms\n\t");
193 #endif
194 }
195
196 static void dct_error(const char *name, int is_idct,
197 void (*fdct_func)(DCTELEM *block),
198 void (*fdct_ref)(DCTELEM *block), int form, int test)
199 {
200 int it, i, scale;
201 int err_inf, v;
202 int64_t err2, ti, ti1, it1;
203 int64_t sysErr[64], sysErrMax=0;
204 int maxout=0;
205 int blockSumErrMax=0, blockSumErr;
206 AVLFG prng;
207
208 av_lfg_init(&prng, 1);
209
210 err_inf = 0;
211 err2 = 0;
212 for(i=0; i<64; i++) sysErr[i]=0;
213 for(it=0;it<NB_ITS;it++) {
214 for(i=0;i<64;i++)
215 block1[i] = 0;
216 switch(test){
217 case 0:
218 for(i=0;i<64;i++)
219 block1[i] = (av_lfg_get(&prng) % 512) -256;
220 if (is_idct){
221 ff_ref_fdct(block1);
222
223 for(i=0;i<64;i++)
224 block1[i]>>=3;
225 }
226 break;
227 case 1:{
228 int num = av_lfg_get(&prng) % 10 + 1;
229 for(i=0;i<num;i++)
230 block1[av_lfg_get(&prng) % 64] = av_lfg_get(&prng) % 512 -256;
231 }break;
232 case 2:
233 block1[0] = av_lfg_get(&prng) % 4096 - 2048;
234 block1[63]= (block1[0]&1)^1;
235 break;
236 }
237
238 #if 0 // simulate mismatch control
239 { int sum=0;
240 for(i=0;i<64;i++)
241 sum+=block1[i];
242
243 if((sum&1)==0) block1[63]^=1;
244 }
245 #endif
246
247 for(i=0; i<64; i++)
248 block_org[i]= block1[i];
249
250 if (form == MMX_PERM) {
251 for(i=0;i<64;i++)
252 block[idct_mmx_perm[i]] = block1[i];
253 } else if (form == MMX_SIMPLE_PERM) {
254 for(i=0;i<64;i++)
255 block[idct_simple_mmx_perm[i]] = block1[i];
256
257 } else if (form == SSE2_PERM) {
258 for(i=0; i<64; i++)
259 block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i];
260 } else if (form == PARTTRANS_PERM) {
261 for(i=0; i<64; i++)
262 block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i];
263 } else {
264 for(i=0; i<64; i++)
265 block[i]= block1[i];
266 }
267 #if 0 // simulate mismatch control for tested IDCT but not the ref
268 { int sum=0;
269 for(i=0;i<64;i++)
270 sum+=block[i];
271
272 if((sum&1)==0) block[63]^=1;
273 }
274 #endif
275
276 fdct_func(block);
277 mmx_emms();
278
279 if (form == SCALE_PERM) {
280 for(i=0; i<64; i++) {
281 scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
282 block[i] = (block[i] * scale /*+ (1<<(AANSCALE_BITS-1))*/) >> AANSCALE_BITS;
283 }
284 }
285
286 fdct_ref(block1);
287
288 blockSumErr=0;
289 for(i=0;i<64;i++) {
290 v = abs(block[i] - block1[i]);
291 if (v > err_inf)
292 err_inf = v;
293 err2 += v * v;
294 sysErr[i] += block[i] - block1[i];
295 blockSumErr += v;
296 if( abs(block[i])>maxout) maxout=abs(block[i]);
297 }
298 if(blockSumErrMax < blockSumErr) blockSumErrMax= blockSumErr;
299 #if 0 // print different matrix pairs
300 if(blockSumErr){
301 printf("\n");
302 for(i=0; i<64; i++){
303 if((i&7)==0) printf("\n");
304 printf("%4d ", block_org[i]);
305 }
306 for(i=0; i<64; i++){
307 if((i&7)==0) printf("\n");
308 printf("%4d ", block[i] - block1[i]);
309 }
310 }
311 #endif
312 }
313 for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i]));
314
315 for(i=0; i<64; i++){
316 if(i%8==0) printf("\n");
317 printf("%7d ", (int)sysErr[i]);
318 }
319 printf("\n");
320
321 printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
322 is_idct ? "IDCT" : "DCT",
323 name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax);
324
325 /* speed test */
326 for(i=0;i<64;i++)
327 block1[i] = 0;
328 switch(test){
329 case 0:
330 for(i=0;i<64;i++)
331 block1[i] = av_lfg_get(&prng) % 512 -256;
332 if (is_idct){
333 ff_ref_fdct(block1);
334
335 for(i=0;i<64;i++)
336 block1[i]>>=3;
337 }
338 break;
339 case 1:{
340 case 2:
341 block1[0] = av_lfg_get(&prng) % 512 -256;
342 block1[1] = av_lfg_get(&prng) % 512 -256;
343 block1[2] = av_lfg_get(&prng) % 512 -256;
344 block1[3] = av_lfg_get(&prng) % 512 -256;
345 }break;
346 }
347
348 if (form == MMX_PERM) {
349 for(i=0;i<64;i++)
350 block[idct_mmx_perm[i]] = block1[i];
351 } else if(form == MMX_SIMPLE_PERM) {
352 for(i=0;i<64;i++)
353 block[idct_simple_mmx_perm[i]] = block1[i];
354 } else {
355 for(i=0; i<64; i++)
356 block[i]= block1[i];
357 }
358
359 ti = gettime();
360 it1 = 0;
361 do {
362 for(it=0;it<NB_ITS_SPEED;it++) {
363 for(i=0; i<64; i++)
364 block[i]= block1[i];
365 // memcpy(block, block1, sizeof(DCTELEM) * 64);
366 // do not memcpy especially not fastmemcpy because it does movntq !!!
367 fdct_func(block);
368 }
369 it1 += NB_ITS_SPEED;
370 ti1 = gettime() - ti;
371 } while (ti1 < 1000000);
372 mmx_emms();
373
374 printf("%s %s: %0.1f kdct/s\n",
375 is_idct ? "IDCT" : "DCT",
376 name, (double)it1 * 1000.0 / (double)ti1);
377 }
378
379 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
380 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
381
382 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
383 {
384 static int init;
385 static double c8[8][8];
386 static double c4[4][4];
387 double block1[64], block2[64], block3[64];
388 double s, sum, v;
389 int i, j, k;
390
391 if (!init) {
392 init = 1;
393
394 for(i=0;i<8;i++) {
395 sum = 0;
396 for(j=0;j<8;j++) {
397 s = (i==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0);
398 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
399 sum += c8[i][j] * c8[i][j];
400 }
401 }
402
403 for(i=0;i<4;i++) {
404 sum = 0;
405 for(j=0;j<4;j++) {
406 s = (i==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0);
407 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
408 sum += c4[i][j] * c4[i][j];
409 }
410 }
411 }
412
413 /* butterfly */
414 s = 0.5 * sqrt(2.0);
415 for(i=0;i<4;i++) {
416 for(j=0;j<8;j++) {
417 block1[8*(2*i)+j] = (block[8*(2*i)+j] + block[8*(2*i+1)+j]) * s;
418 block1[8*(2*i+1)+j] = (block[8*(2*i)+j] - block[8*(2*i+1)+j]) * s;
419 }
420 }
421
422 /* idct8 on lines */
423 for(i=0;i<8;i++) {
424 for(j=0;j<8;j++) {
425 sum = 0;
426 for(k=0;k<8;k++)
427 sum += c8[k][j] * block1[8*i+k];
428 block2[8*i+j] = sum;
429 }
430 }
431
432 /* idct4 */
433 for(i=0;i<8;i++) {
434 for(j=0;j<4;j++) {
435 /* top */
436 sum = 0;
437 for(k=0;k<4;k++)
438 sum += c4[k][j] * block2[8*(2*k)+i];
439 block3[8*(2*j)+i] = sum;
440
441 /* bottom */
442 sum = 0;
443 for(k=0;k<4;k++)
444 sum += c4[k][j] * block2[8*(2*k+1)+i];
445 block3[8*(2*j+1)+i] = sum;
446 }
447 }
448
449 /* clamp and store the result */
450 for(i=0;i<8;i++) {
451 for(j=0;j<8;j++) {
452 v = block3[8*i+j];
453 if (v < 0)
454 v = 0;
455 else if (v > 255)
456 v = 255;
457 dest[i * linesize + j] = (int)rint(v);
458 }
459 }
460 }
461
462 static void idct248_error(const char *name,
463 void (*idct248_put)(uint8_t *dest, int line_size, int16_t *block))
464 {
465 int it, i, it1, ti, ti1, err_max, v;
466
467 AVLFG prng;
468
469 av_lfg_init(&prng, 1);
470
471 /* just one test to see if code is correct (precision is less
472 important here) */
473 err_max = 0;
474 for(it=0;it<NB_ITS;it++) {
475
476 /* XXX: use forward transform to generate values */
477 for(i=0;i<64;i++)
478 block1[i] = av_lfg_get(&prng) % 256 - 128;
479 block1[0] += 1024;
480
481 for(i=0; i<64; i++)
482 block[i]= block1[i];
483 idct248_ref(img_dest1, 8, block);
484
485 for(i=0; i<64; i++)
486 block[i]= block1[i];
487 idct248_put(img_dest, 8, block);
488
489 for(i=0;i<64;i++) {
490 v = abs((int)img_dest[i] - (int)img_dest1[i]);
491 if (v == 255)
492 printf("%d %d\n", img_dest[i], img_dest1[i]);
493 if (v > err_max)
494 err_max = v;
495 }
496 }
497 printf("%s %s: err_inf=%d\n",
498 1 ? "IDCT248" : "DCT248",
499 name, err_max);
500
501 ti = gettime();
502 it1 = 0;
503 do {
504 for(it=0;it<NB_ITS_SPEED;it++) {
505 for(i=0; i<64; i++)
506 block[i]= block1[i];
507 // memcpy(block, block1, sizeof(DCTELEM) * 64);
508 // do not memcpy especially not fastmemcpy because it does movntq !!!
509 idct248_put(img_dest, 8, block);
510 }
511 it1 += NB_ITS_SPEED;
512 ti1 = gettime() - ti;
513 } while (ti1 < 1000000);
514 mmx_emms();
515
516 printf("%s %s: %0.1f kdct/s\n",
517 1 ? "IDCT248" : "DCT248",
518 name, (double)it1 * 1000.0 / (double)ti1);
519 }
520
521 static void help(void)
522 {
523 printf("dct-test [-i] [<test-number>]\n"
524 "test-number 0 -> test with random matrixes\n"
525 " 1 -> test with random sparse matrixes\n"
526 " 2 -> do 3. test from mpeg4 std\n"
527 "-i test IDCT implementations\n"
528 "-4 test IDCT248 implementations\n");
529 }
530
531 int main(int argc, char **argv)
532 {
533 int test_idct = 0, test_248_dct = 0;
534 int c,i;
535 int test=1;
536 cpu_flags = av_get_cpu_flags();
537
538 ff_ref_dct_init();
539 idct_mmx_init();
540
541 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
542 for(i=0;i<MAX_NEG_CROP;i++) {
543 cropTbl[i] = 0;
544 cropTbl[i + MAX_NEG_CROP + 256] = 255;
545 }
546
547 for(;;) {
548 c = getopt(argc, argv, "ih4");
549 if (c == -1)
550 break;
551 switch(c) {
552 case 'i':
553 test_idct = 1;
554 break;
555 case '4':
556 test_248_dct = 1;
557 break;
558 default :
559 case 'h':
560 help();
561 return 0;
562 }
563 }
564
565 if(optind <argc) test= atoi(argv[optind]);
566
567 printf("ffmpeg DCT/IDCT test\n");
568
569 if (test_248_dct) {
570 idct248_error("SIMPLE-C", ff_simple_idct248_put);
571 } else {
572 for (i=0;algos[i].name;i++)
573 if (algos[i].is_idct == test_idct && !(~cpu_flags & algos[i].mm_support)) {
574 dct_error (algos[i].name, algos[i].is_idct, algos[i].func, algos[i].ref, algos[i].format, test);
575 }
576 }
577 return 0;
578 }