vorbisdsp: convert x86 simd functions from inline asm to yasm.
[libav.git] / libavcodec / dct-test.c
CommitLineData
04d7f601
DB
/*
 * (c) 2001 Fabrice Bellard
 *     2007 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21
/**
 * @file
 * DCT test (c) 2001 Fabrice Bellard
 * Started from sample code by Juan J. Sierralta P.
 */
27
667fb97a 28#include "config.h"
de6d9b64
FB
29#include <stdlib.h>
30#include <stdio.h>
31#include <string.h>
667fb97a 32#if HAVE_UNISTD_H
de6d9b64 33#include <unistd.h>
667fb97a 34#endif
12807c8d 35#include <math.h>
de6d9b64 36
c6c98d08 37#include "libavutil/cpu.h"
ae32e509 38#include "libavutil/common.h"
294eaa26 39#include "libavutil/lfg.h"
980f81d9 40#include "libavutil/time.h"
de6d9b64 41
86748dbc 42#include "simple_idct.h"
10ac3618 43#include "aandcttab.h"
65e4c8c9 44#include "faandct.h"
6f08c541 45#include "faanidct.h"
a6493a8f 46#include "x86/idct_xvid.h"
6a813295 47#include "dctref.h"
9e1586fc 48
434df899
MN
49#undef printf
50
3ac35bdb 51// BFIN
9686df2b
DB
52void ff_bfin_idct(DCTELEM *block);
53void ff_bfin_fdct(DCTELEM *block);
3ac35bdb
MH
54
55// ALTIVEC
07333750 56void ff_fdct_altivec(DCTELEM *block);
3ac35bdb 57
479044ce 58// ARM
0926c009
MR
59void ff_j_rev_dct_arm(DCTELEM *data);
60void ff_simple_idct_arm(DCTELEM *data);
61void ff_simple_idct_armv5te(DCTELEM *data);
479044ce
MR
62void ff_simple_idct_armv6(DCTELEM *data);
63void ff_simple_idct_neon(DCTELEM *data);
3ac35bdb 64
2a839eeb
MR
65void ff_simple_idct_axp(DCTELEM *data);
66
3ac35bdb 67struct algo {
36fa9ef3 68 const char *name;
36fa9ef3 69 void (*func)(DCTELEM *block);
36fa9ef3
MR
70 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
71 SSE2_PERM, PARTTRANS_PERM } format;
72 int mm_support;
dbf396d4 73 int nonspec;
3ac35bdb
MH
74};
75
aadd27cd
MN
/* CPU capability mask; main() fills it from av_get_cpu_flags() and compares
 * it against each algorithm's mm_support field. */
static int cpu_flags;
77
4b357756 78static const struct algo fdct_tab[] = {
74965f26 79 { "REF-DBL", ff_ref_fdct, NO_PERM },
856c8e0a 80 { "FAAN", ff_faandct, NO_PERM },
3e2efacd 81 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
0a72533e 82 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
3ac35bdb 83
17337f54 84#if HAVE_MMX_INLINE
74965f26 85 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
d8eda370 86 { "MMXEXT", ff_fdct_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT },
74965f26 87 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
94254fc0 88#endif
3ac35bdb 89
4b357756 90#if HAVE_ALTIVEC
07333750 91 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
4b357756
MR
92#endif
93
94#if ARCH_BFIN
74965f26 95 { "BFINfdct", ff_bfin_fdct, NO_PERM },
4b357756
MR
96#endif
97
98 { 0 }
99};
100
101static const struct algo idct_tab[] = {
74965f26
MR
102 { "FAANI", ff_faanidct, NO_PERM },
103 { "REF-DBL", ff_ref_idct, NO_PERM },
c8e1b2fb 104 { "INT", ff_j_rev_dct, MMX_PERM },
e7a972e1 105 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
4b357756 106
17337f54 107#if HAVE_MMX_INLINE
74965f26
MR
108 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
109 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
d8eda370 110 { "XVID-MMXEXT", ff_idct_xvid_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
74965f26 111 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
3ac35bdb
MH
112#endif
113
b250f9c6 114#if ARCH_BFIN
74965f26 115 { "BFINidct", ff_bfin_idct, NO_PERM },
3ac35bdb
MH
116#endif
117
b250f9c6 118#if ARCH_ARM
74965f26
MR
119 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
120 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
4b357756 121#endif
b250f9c6 122#if HAVE_ARMV5TE
c29d49c1 123 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM, AV_CPU_FLAG_ARMV5TE },
479044ce 124#endif
b250f9c6 125#if HAVE_ARMV6
c29d49c1 126 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM, AV_CPU_FLAG_ARMV6 },
479044ce 127#endif
b250f9c6 128#if HAVE_NEON
c29d49c1 129 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM, AV_CPU_FLAG_NEON },
479044ce 130#endif
479044ce 131
2a839eeb 132#if ARCH_ALPHA
74965f26 133 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
2a839eeb
MR
134#endif
135
36fa9ef3 136 { 0 }
3ac35bdb
MH
137};
138
/* Fixed-point precision used when rescaling the output of SCALE_PERM
 * transforms with ff_aanscales. */
#define AANSCALE_BITS 12

/* Number of blocks pushed through each transform by the accuracy tests. */
#define NB_ITS 20000
/* Number of transform calls between two clock reads in the speed tests. */
#define NB_ITS_SPEED 50000
143
9e1586fc
FB
/* Destination index for every source coefficient under MMX_PERM;
 * computed at start-up by idct_mmx_init(). */
static short idct_mmx_perm[64];

/* Destination index for every source coefficient under MMX_SIMPLE_PERM. */
static short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Destination column inside a row for every source column under SSE2_PERM. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
ad246860 158
504ffed1 159static void idct_mmx_init(void)
9e1586fc
FB
160{
161 int i;
162
163 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
164 for (i = 0; i < 64; i++) {
bb270c08 165 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
9e1586fc
FB
166 }
167}
168
c6727809 169DECLARE_ALIGNED(16, static DCTELEM, block)[64];
36fa9ef3 170DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
9e1586fc 171
ae2e8971
MR
172static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng)
173{
174 int i, j;
175
176 memset(block, 0, 64 * sizeof(*block));
177
178 switch (test) {
179 case 0:
180 for (i = 0; i < 64; i++)
181 block[i] = (av_lfg_get(prng) % 512) - 256;
182 if (is_idct) {
183 ff_ref_fdct(block);
184 for (i = 0; i < 64; i++)
185 block[i] >>= 3;
186 }
187 break;
188 case 1:
189 j = av_lfg_get(prng) % 10 + 1;
190 for (i = 0; i < j; i++)
191 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
192 break;
193 case 2:
194 block[ 0] = av_lfg_get(prng) % 4096 - 2048;
195 block[63] = (block[0] & 1) ^ 1;
196 break;
197 }
198}
199
200static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
201{
202 int i;
203
204 if (perm == MMX_PERM) {
205 for (i = 0; i < 64; i++)
206 dst[idct_mmx_perm[i]] = src[i];
207 } else if (perm == MMX_SIMPLE_PERM) {
208 for (i = 0; i < 64; i++)
209 dst[idct_simple_mmx_perm[i]] = src[i];
210 } else if (perm == SSE2_PERM) {
211 for (i = 0; i < 64; i++)
212 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
213 } else if (perm == PARTTRANS_PERM) {
214 for (i = 0; i < 64; i++)
215 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
216 } else {
217 for (i = 0; i < 64; i++)
218 dst[i] = src[i];
219 }
220}
221
dbf396d4 222static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
de6d9b64 223{
74965f26 224 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
de6d9b64 225 int it, i, scale;
de6d9b64 226 int err_inf, v;
dbf396d4 227 int64_t err2, ti, ti1, it1, err_sum = 0;
36fa9ef3
MR
228 int64_t sysErr[64], sysErrMax = 0;
229 int maxout = 0;
230 int blockSumErrMax = 0, blockSumErr;
64bde197 231 AVLFG prng;
dbf396d4
MR
232 double omse, ome;
233 int spec_err;
de6d9b64 234
64bde197 235 av_lfg_init(&prng, 1);
de6d9b64
FB
236
237 err_inf = 0;
238 err2 = 0;
36fa9ef3
MR
239 for (i = 0; i < 64; i++)
240 sysErr[i] = 0;
241 for (it = 0; it < NB_ITS; it++) {
ae2e8971
MR
242 init_block(block1, test, is_idct, &prng);
243 permute(block, block1, dct->format);
9e1586fc 244
4f905a65 245 dct->func(block);
db7d8fb4 246 emms_c();
9e1586fc 247
4f905a65 248 if (dct->format == SCALE_PERM) {
36fa9ef3
MR
249 for (i = 0; i < 64; i++) {
250 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
251 block[i] = (block[i] * scale) >> AANSCALE_BITS;
86748dbc
MN
252 }
253 }
254
74965f26 255 ref(block1);
de6d9b64 256
36fa9ef3
MR
257 blockSumErr = 0;
258 for (i = 0; i < 64; i++) {
dbf396d4
MR
259 int err = block[i] - block1[i];
260 err_sum += err;
261 v = abs(err);
de6d9b64
FB
262 if (v > err_inf)
263 err_inf = v;
264 err2 += v * v;
bb270c08
DB
265 sysErr[i] += block[i] - block1[i];
266 blockSumErr += v;
36fa9ef3
MR
267 if (abs(block[i]) > maxout)
268 maxout = abs(block[i]);
de6d9b64 269 }
36fa9ef3
MR
270 if (blockSumErrMax < blockSumErr)
271 blockSumErrMax = blockSumErr;
86748dbc 272 }
36fa9ef3
MR
273 for (i = 0; i < 64; i++)
274 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
115329f1 275
36fa9ef3
MR
276 for (i = 0; i < 64; i++) {
277 if (i % 8 == 0)
278 printf("\n");
279 printf("%7d ", (int) sysErr[i]);
de6d9b64 280 }
86748dbc 281 printf("\n");
115329f1 282
dbf396d4
MR
283 omse = (double) err2 / NB_ITS / 64;
284 ome = (double) err_sum / NB_ITS / 64;
285
286 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
287
288 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
cf2b4f88 289 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
dbf396d4 290 omse, ome, (double) sysErrMax / NB_ITS,
36fa9ef3 291 maxout, blockSumErrMax);
e6ff0648 292
dbf396d4
MR
293 if (spec_err && !dct->nonspec)
294 return 1;
295
7fd2c138 296 if (!speed)
dbf396d4 297 return 0;
7fd2c138 298
de6d9b64 299 /* speed test */
ae2e8971
MR
300 init_block(block, test, is_idct, &prng);
301 permute(block1, block, dct->format);
9e1586fc 302
980f81d9 303 ti = av_gettime();
de6d9b64
FB
304 it1 = 0;
305 do {
36fa9ef3 306 for (it = 0; it < NB_ITS_SPEED; it++) {
ae2e8971 307 memcpy(block, block1, sizeof(block));
4f905a65 308 dct->func(block);
de6d9b64
FB
309 }
310 it1 += NB_ITS_SPEED;
980f81d9 311 ti1 = av_gettime() - ti;
de6d9b64 312 } while (ti1 < 1000000);
db7d8fb4 313 emms_c();
de6d9b64 314
cf2b4f88 315 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
36fa9ef3 316 (double) it1 * 1000.0 / (double) ti1);
dbf396d4
MR
317
318 return 0;
de6d9b64
FB
319}
320
c6727809
MR
321DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
322DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
a46a3ce4 323
504ffed1 324static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
a46a3ce4
FB
325{
326 static int init;
327 static double c8[8][8];
328 static double c4[4][4];
329 double block1[64], block2[64], block3[64];
330 double s, sum, v;
331 int i, j, k;
332
333 if (!init) {
334 init = 1;
335
36fa9ef3 336 for (i = 0; i < 8; i++) {
a46a3ce4 337 sum = 0;
36fa9ef3
MR
338 for (j = 0; j < 8; j++) {
339 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
a46a3ce4
FB
340 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
341 sum += c8[i][j] * c8[i][j];
342 }
343 }
115329f1 344
36fa9ef3 345 for (i = 0; i < 4; i++) {
a46a3ce4 346 sum = 0;
36fa9ef3
MR
347 for (j = 0; j < 4; j++) {
348 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
a46a3ce4
FB
349 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
350 sum += c4[i][j] * c4[i][j];
351 }
352 }
353 }
354
355 /* butterfly */
652f0197 356 s = 0.5 * sqrt(2.0);
36fa9ef3
MR
357 for (i = 0; i < 4; i++) {
358 for (j = 0; j < 8; j++) {
359 block1[8 * (2 * i) + j] =
360 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
361 block1[8 * (2 * i + 1) + j] =
362 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
a46a3ce4
FB
363 }
364 }
365
366 /* idct8 on lines */
36fa9ef3
MR
367 for (i = 0; i < 8; i++) {
368 for (j = 0; j < 8; j++) {
a46a3ce4 369 sum = 0;
36fa9ef3
MR
370 for (k = 0; k < 8; k++)
371 sum += c8[k][j] * block1[8 * i + k];
372 block2[8 * i + j] = sum;
a46a3ce4
FB
373 }
374 }
375
376 /* idct4 */
36fa9ef3
MR
377 for (i = 0; i < 8; i++) {
378 for (j = 0; j < 4; j++) {
a46a3ce4
FB
379 /* top */
380 sum = 0;
36fa9ef3
MR
381 for (k = 0; k < 4; k++)
382 sum += c4[k][j] * block2[8 * (2 * k) + i];
383 block3[8 * (2 * j) + i] = sum;
a46a3ce4
FB
384
385 /* bottom */
386 sum = 0;
36fa9ef3
MR
387 for (k = 0; k < 4; k++)
388 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
389 block3[8 * (2 * j + 1) + i] = sum;
a46a3ce4
FB
390 }
391 }
392
393 /* clamp and store the result */
36fa9ef3
MR
394 for (i = 0; i < 8; i++) {
395 for (j = 0; j < 8; j++) {
396 v = block3[8 * i + j];
397 if (v < 0) v = 0;
398 else if (v > 255) v = 255;
399 dest[i * linesize + j] = (int) rint(v);
a46a3ce4
FB
400 }
401 }
402}
403
504ffed1 404static void idct248_error(const char *name,
36fa9ef3 405 void (*idct248_put)(uint8_t *dest, int line_size,
7fd2c138
MR
406 int16_t *block),
407 int speed)
a46a3ce4
FB
408{
409 int it, i, it1, ti, ti1, err_max, v;
64bde197 410 AVLFG prng;
294eaa26 411
64bde197 412 av_lfg_init(&prng, 1);
115329f1 413
a46a3ce4
FB
414 /* just one test to see if code is correct (precision is less
415 important here) */
416 err_max = 0;
36fa9ef3 417 for (it = 0; it < NB_ITS; it++) {
652f0197 418 /* XXX: use forward transform to generate values */
36fa9ef3 419 for (i = 0; i < 64; i++)
64bde197 420 block1[i] = av_lfg_get(&prng) % 256 - 128;
652f0197
FB
421 block1[0] += 1024;
422
36fa9ef3
MR
423 for (i = 0; i < 64; i++)
424 block[i] = block1[i];
a46a3ce4 425 idct248_ref(img_dest1, 8, block);
115329f1 426
36fa9ef3
MR
427 for (i = 0; i < 64; i++)
428 block[i] = block1[i];
652f0197 429 idct248_put(img_dest, 8, block);
115329f1 430
36fa9ef3
MR
431 for (i = 0; i < 64; i++) {
432 v = abs((int) img_dest[i] - (int) img_dest1[i]);
652f0197
FB
433 if (v == 255)
434 printf("%d %d\n", img_dest[i], img_dest1[i]);
435 if (v > err_max)
436 err_max = v;
437 }
a46a3ce4 438 }
36fa9ef3 439 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
a46a3ce4 440
7fd2c138
MR
441 if (!speed)
442 return;
443
980f81d9 444 ti = av_gettime();
a46a3ce4
FB
445 it1 = 0;
446 do {
36fa9ef3
MR
447 for (it = 0; it < NB_ITS_SPEED; it++) {
448 for (i = 0; i < 64; i++)
449 block[i] = block1[i];
a46a3ce4
FB
450 idct248_put(img_dest, 8, block);
451 }
452 it1 += NB_ITS_SPEED;
980f81d9 453 ti1 = av_gettime() - ti;
a46a3ce4 454 } while (ti1 < 1000000);
db7d8fb4 455 emms_c();
a46a3ce4 456
36fa9ef3
MR
457 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
458 (double) it1 * 1000.0 / (double) ti1);
a46a3ce4
FB
459}
460
/** Print the command-line usage summary to standard output. */
static void help(void)
{
    static const char usage[] =
        "dct-test [-i] [<test-number>]\n"
        "test-number 0 -> test with random matrixes\n"
        "            1 -> test with random sparse matrixes\n"
        "            2 -> do 3. test from mpeg4 std\n"
        "-i          test IDCT implementations\n"
        "-4          test IDCT248 implementations\n"
        "-t          speed test\n";

    printf("%s", usage);
}
471
667fb97a
RB
472#if !HAVE_GETOPT
473#include "compat/getopt.c"
474#endif
475
de6d9b64
FB
476int main(int argc, char **argv)
477{
a46a3ce4 478 int test_idct = 0, test_248_dct = 0;
36fa9ef3
MR
479 int c, i;
480 int test = 1;
7fd2c138 481 int speed = 0;
dbf396d4 482 int err = 0;
36fa9ef3 483
c6c98d08 484 cpu_flags = av_get_cpu_flags();
9e1586fc 485
0de74546 486 ff_ref_dct_init();
9e1586fc 487 idct_mmx_init();
f67a10cd 488
36fa9ef3 489 for (;;) {
7fd2c138 490 c = getopt(argc, argv, "ih4t");
9e1586fc
FB
491 if (c == -1)
492 break;
36fa9ef3 493 switch (c) {
9e1586fc
FB
494 case 'i':
495 test_idct = 1;
496 break;
a46a3ce4
FB
497 case '4':
498 test_248_dct = 1;
499 break;
7fd2c138
MR
500 case 't':
501 speed = 1;
502 break;
36fa9ef3 503 default:
9e1586fc
FB
504 case 'h':
505 help();
c6bdc908 506 return 0;
9e1586fc
FB
507 }
508 }
115329f1 509
36fa9ef3
MR
510 if (optind < argc)
511 test = atoi(argv[optind]);
115329f1 512
f36b3902 513 printf("Libav DCT/IDCT test\n");
9e1586fc 514
a46a3ce4 515 if (test_248_dct) {
7fd2c138 516 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
9e1586fc 517 } else {
4b357756 518 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
36fa9ef3 519 for (i = 0; algos[i].name; i++)
4b357756 520 if (!(~cpu_flags & algos[i].mm_support)) {
dbf396d4 521 err |= dct_error(&algos[i], test, test_idct, speed);
36fa9ef3 522 }
9e1586fc 523 }
dbf396d4
MR
524
525 return err;
de6d9b64 526}