(m)jpeg pad/flush with 1 instead of 0, fix by Rik Snel <rsnel@cube.dyndns.org>
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
1/*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Gerard Lantau.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19#include <stdlib.h>
20#include <stdio.h>
21#include "avcodec.h"
22#include "dsputil.h"
d962f6fd 23#include "simple_idct.h"
de6d9b64 24
4af7bcc1 25void (*ff_idct)(DCTELEM *block);
de6d9b64
FB
26void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
27void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
28void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
29
30op_pixels_abs_func pix_abs16x16;
31op_pixels_abs_func pix_abs16x16_x2;
32op_pixels_abs_func pix_abs16x16_y2;
33op_pixels_abs_func pix_abs16x16_xy2;
34
0cfa9713 35UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
de6d9b64
FB
36UINT32 squareTbl[512];
37
e0eac44e
FB
38extern UINT16 default_intra_matrix[64];
39extern UINT16 default_non_intra_matrix[64];
40
41UINT8 zigzag_direct[64] = {
42 0, 1, 8, 16, 9, 2, 3, 10,
43 17, 24, 32, 25, 18, 11, 4, 5,
44 12, 19, 26, 33, 40, 48, 41, 34,
45 27, 20, 13, 6, 7, 14, 21, 28,
46 35, 42, 49, 56, 57, 50, 43, 36,
47 29, 22, 15, 23, 30, 37, 44, 51,
48 58, 59, 52, 45, 38, 31, 39, 46,
49 53, 60, 61, 54, 47, 55, 62, 63
50};
51
52UINT8 ff_alternate_horizontal_scan[64] = {
53 0, 1, 2, 3, 8, 9, 16, 17,
54 10, 11, 4, 5, 6, 7, 15, 14,
55 13, 12, 19, 18, 24, 25, 32, 33,
56 26, 27, 20, 21, 22, 23, 28, 29,
57 30, 31, 34, 35, 40, 41, 48, 49,
58 42, 43, 36, 37, 38, 39, 44, 45,
59 46, 47, 50, 51, 56, 57, 58, 59,
60 52, 53, 54, 55, 60, 61, 62, 63,
61};
62
63UINT8 ff_alternate_vertical_scan[64] = {
64 0, 8, 16, 24, 1, 9, 2, 10,
65 17, 25, 32, 40, 48, 56, 57, 49,
66 41, 33, 26, 18, 3, 11, 4, 12,
67 19, 27, 34, 42, 50, 58, 35, 43,
68 51, 59, 20, 28, 5, 13, 6, 14,
69 21, 29, 36, 44, 52, 60, 37, 45,
70 53, 61, 22, 30, 7, 15, 23, 31,
71 38, 46, 54, 62, 39, 47, 55, 63,
72};
73
0a8d8945 74/* Input permutation for the simple_idct_mmx */
5a240838 75static UINT8 simple_mmx_permutation[64]={
0a8d8945
MN
76 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
77 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
78 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
79 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
80 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
81 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
82 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
83 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
5a240838
MN
84};
85
badaf88e
MN
86/* used to skip zeros at the end */
87UINT8 zigzag_end[64];
88
5a240838
MN
89UINT8 permutation[64];
90//UINT8 invPermutation[64];
91
badaf88e
MN
92static void build_zigzag_end()
93{
94 int lastIndex;
95 int lastIndexAfterPerm=0;
96 for(lastIndex=0; lastIndex<64; lastIndex++)
97 {
98 if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
99 lastIndexAfterPerm= zigzag_direct[lastIndex];
100 zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
101 }
102}
103
de6d9b64
FB
104void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size)
105{
106 DCTELEM *p;
107 const UINT8 *pix;
108 int i;
109
110 /* read the pixels */
111 p = block;
112 pix = pixels;
113 for(i=0;i<8;i++) {
114 p[0] = pix[0];
115 p[1] = pix[1];
116 p[2] = pix[2];
117 p[3] = pix[3];
118 p[4] = pix[4];
119 p[5] = pix[5];
120 p[6] = pix[6];
121 p[7] = pix[7];
122 pix += line_size;
123 p += 8;
124 }
125}
126
127void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
128{
129 const DCTELEM *p;
130 UINT8 *pix;
131 int i;
132 UINT8 *cm = cropTbl + MAX_NEG_CROP;
133
134 /* read the pixels */
135 p = block;
136 pix = pixels;
137 for(i=0;i<8;i++) {
138 pix[0] = cm[p[0]];
139 pix[1] = cm[p[1]];
140 pix[2] = cm[p[2]];
141 pix[3] = cm[p[3]];
142 pix[4] = cm[p[4]];
143 pix[5] = cm[p[5]];
144 pix[6] = cm[p[6]];
145 pix[7] = cm[p[7]];
146 pix += line_size;
147 p += 8;
148 }
149}
150
151void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
152{
153 const DCTELEM *p;
154 UINT8 *pix;
155 int i;
156 UINT8 *cm = cropTbl + MAX_NEG_CROP;
157
158 /* read the pixels */
159 p = block;
160 pix = pixels;
161 for(i=0;i<8;i++) {
162 pix[0] = cm[pix[0] + p[0]];
163 pix[1] = cm[pix[1] + p[1]];
164 pix[2] = cm[pix[2] + p[2]];
165 pix[3] = cm[pix[3] + p[3]];
166 pix[4] = cm[pix[4] + p[4]];
167 pix[5] = cm[pix[5] + p[5]];
168 pix[6] = cm[pix[6] + p[6]];
169 pix[7] = cm[pix[7] + p[7]];
170 pix += line_size;
171 p += 8;
172 }
173}
174
/*
 * PIXOP(BTYPE, OPNAME, OP, INCR) generates four static kernels that each
 * process h rows of 8 elements, plus a public table holding them:
 *
 *   OPNAME_pixels      OP(dst, src)
 *   OPNAME_pixels_x2   OP(dst, avg2(src, right neighbour))   reads 9 pixels per row
 *   OPNAME_pixels_y2   OP(dst, avg2(src, pixel one row down)) reads h + 1 rows
 *   OPNAME_pixels_xy2  OP(dst, avg4 of the 2x2 neighbourhood) reads 9 pixels, h + 1 rows
 *   OPNAME_pixels_tab  { plain, x2, y2, xy2 }
 *
 * BTYPE is the destination element type, OP(dst, src) the per-element
 * operation, and INCR the destination row step; INCR is expanded inside the
 * generated functions and may therefore name their line_size parameter.
 * avg2()/avg4() are looked up at instantiation time, so the rounding mode is
 * whatever those macros are defined as where PIXOP is invoked.
 *
 * Rows are counted with "h > 0" so that h <= 0 is a no-op, matching the
 * pix_abs16x16*_c loops.
 */
#define PIXOP(BTYPE, OPNAME, OP, INCR) \
 \
static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    BTYPE *p; \
    const UINT8 *pix; \
 \
    p = block; \
    pix = pixels; \
    for (; h > 0; h--) { \
        OP(p[0], pix[0]); \
        OP(p[1], pix[1]); \
        OP(p[2], pix[2]); \
        OP(p[3], pix[3]); \
        OP(p[4], pix[4]); \
        OP(p[5], pix[5]); \
        OP(p[6], pix[6]); \
        OP(p[7], pix[7]); \
        pix += line_size; \
        p += INCR; \
    } \
} \
 \
static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    BTYPE *p; \
    const UINT8 *pix; \
 \
    p = block; \
    pix = pixels; \
    for (; h > 0; h--) { \
        OP(p[0], avg2(pix[0], pix[1])); \
        OP(p[1], avg2(pix[1], pix[2])); \
        OP(p[2], avg2(pix[2], pix[3])); \
        OP(p[3], avg2(pix[3], pix[4])); \
        OP(p[4], avg2(pix[4], pix[5])); \
        OP(p[5], avg2(pix[5], pix[6])); \
        OP(p[6], avg2(pix[6], pix[7])); \
        OP(p[7], avg2(pix[7], pix[8])); \
        pix += line_size; \
        p += INCR; \
    } \
} \
 \
static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    BTYPE *p; \
    const UINT8 *pix; \
    const UINT8 *pix1; \
 \
    p = block; \
    pix = pixels; \
    pix1 = pixels + line_size; \
    for (; h > 0; h--) { \
        OP(p[0], avg2(pix[0], pix1[0])); \
        OP(p[1], avg2(pix[1], pix1[1])); \
        OP(p[2], avg2(pix[2], pix1[2])); \
        OP(p[3], avg2(pix[3], pix1[3])); \
        OP(p[4], avg2(pix[4], pix1[4])); \
        OP(p[5], avg2(pix[5], pix1[5])); \
        OP(p[6], avg2(pix[6], pix1[6])); \
        OP(p[7], avg2(pix[7], pix1[7])); \
        pix += line_size; \
        pix1 += line_size; \
        p += INCR; \
    } \
} \
 \
static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    BTYPE *p; \
    const UINT8 *pix; \
    const UINT8 *pix1; \
 \
    p = block; \
    pix = pixels; \
    pix1 = pixels + line_size; \
    for (; h > 0; h--) { \
        OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1])); \
        OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2])); \
        OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3])); \
        OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4])); \
        OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5])); \
        OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6])); \
        OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7])); \
        OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8])); \
        pix += line_size; \
        pix1 += line_size; \
        p += INCR; \
    } \
} \
 \
void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
    OPNAME ## _pixels, \
    OPNAME ## _pixels_x2, \
    OPNAME ## _pixels_y2, \
    OPNAME ## _pixels_xy2, \
};
273
274
/* rounding primitives: averages round halves up.
   Arguments and results are parenthesized so that any expression may be
   passed and the result may be used inside a larger expression. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)

/* per-element operations handed to PIXOP as OP(dst, src) */
#define op_put(a, b) ((a) = (b))
#define op_avg(a, b) ((a) = avg2(a, b))
#define op_sub(a, b) ((a) -= (b))
282
283PIXOP(UINT8, put, op_put, line_size)
284PIXOP(UINT8, avg, op_avg, line_size)
285
286PIXOP(DCTELEM, sub, op_sub, 8)
287
288/* not rounding primitives */
289#undef avg2
290#undef avg4
291#define avg2(a,b) ((a+b)>>1)
292#define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
293
294PIXOP(UINT8, put_no_rnd, op_put, line_size)
295PIXOP(UINT8, avg_no_rnd, op_avg, line_size)
296
/* motion estimation */

/* back to rounding averages for the pix_abs16x16*_c functions.
   Arguments and results are parenthesized so that any expression may be
   passed and the result may be used inside a larger expression. */
#undef avg2
#undef avg4
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
303
304int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
305{
306 int s, i;
307
308 s = 0;
309 for(i=0;i<h;i++) {
310 s += abs(pix1[0] - pix2[0]);
311 s += abs(pix1[1] - pix2[1]);
312 s += abs(pix1[2] - pix2[2]);
313 s += abs(pix1[3] - pix2[3]);
314 s += abs(pix1[4] - pix2[4]);
315 s += abs(pix1[5] - pix2[5]);
316 s += abs(pix1[6] - pix2[6]);
317 s += abs(pix1[7] - pix2[7]);
318 s += abs(pix1[8] - pix2[8]);
319 s += abs(pix1[9] - pix2[9]);
320 s += abs(pix1[10] - pix2[10]);
321 s += abs(pix1[11] - pix2[11]);
322 s += abs(pix1[12] - pix2[12]);
323 s += abs(pix1[13] - pix2[13]);
324 s += abs(pix1[14] - pix2[14]);
325 s += abs(pix1[15] - pix2[15]);
326 pix1 += line_size;
327 pix2 += line_size;
328 }
329 return s;
330}
331
332int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
333{
334 int s, i;
335
336 s = 0;
337 for(i=0;i<h;i++) {
338 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
339 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
340 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
341 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
342 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
343 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
344 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
345 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
346 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
347 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
348 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
349 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
350 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
351 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
352 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
353 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
354 pix1 += line_size;
355 pix2 += line_size;
356 }
357 return s;
358}
359
360int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
361{
362 int s, i;
363 UINT8 *pix3 = pix2 + line_size;
364
365 s = 0;
366 for(i=0;i<h;i++) {
367 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
368 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
369 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
370 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
371 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
372 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
373 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
374 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
375 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
376 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
377 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
378 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
379 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
380 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
381 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
382 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
383 pix1 += line_size;
384 pix2 += line_size;
385 pix3 += line_size;
386 }
387 return s;
388}
389
390int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size, int h)
391{
392 int s, i;
393 UINT8 *pix3 = pix2 + line_size;
394
395 s = 0;
396 for(i=0;i<h;i++) {
397 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
398 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
399 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
400 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
401 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
402 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
403 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
404 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
405 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
406 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
407 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
408 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
409 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
410 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
411 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
412 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
413 pix1 += line_size;
414 pix2 += line_size;
415 pix3 += line_size;
416 }
417 return s;
418}
419
e0eac44e
FB
420/* permute block according so that it corresponds to the MMX idct
421 order */
d962f6fd 422#ifdef SIMPLE_IDCT
5a240838 423 /* general permutation, but perhaps slightly slower */
d962f6fd
A
424void block_permute(INT16 *block)
425{
426 int i;
427 INT16 temp[64];
428
d962f6fd
A
429 for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
430
431 for(i=0; i<64; i++) block[i] = temp[i];
d962f6fd 432}
d962f6fd
A
433#else
434
e0eac44e 435void block_permute(INT16 *block)
de6d9b64 436{
e0eac44e 437 int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
de6d9b64
FB
438 int i;
439
e0eac44e
FB
440 for(i=0;i<8;i++) {
441 tmp1 = block[1];
442 tmp2 = block[2];
443 tmp3 = block[3];
444 tmp4 = block[4];
445 tmp5 = block[5];
446 tmp6 = block[6];
447 block[1] = tmp2;
448 block[2] = tmp4;
449 block[3] = tmp6;
450 block[4] = tmp1;
451 block[5] = tmp3;
452 block[6] = tmp5;
453 block += 8;
454 }
455}
d962f6fd 456#endif
e0eac44e
FB
457
458void dsputil_init(void)
459{
460 int i, j;
c34270f5 461 int use_permuted_idct;
e0eac44e 462
de6d9b64
FB
463 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
464 for(i=0;i<MAX_NEG_CROP;i++) {
465 cropTbl[i] = 0;
466 cropTbl[i + MAX_NEG_CROP + 256] = 255;
467 }
468
469 for(i=0;i<512;i++) {
470 squareTbl[i] = (i - 256) * (i - 256);
471 }
472
d962f6fd
A
473#ifdef SIMPLE_IDCT
474 ff_idct = simple_idct;
475#else
4af7bcc1 476 ff_idct = j_rev_dct;
d962f6fd 477#endif
de6d9b64
FB
478 get_pixels = get_pixels_c;
479 put_pixels_clamped = put_pixels_clamped_c;
480 add_pixels_clamped = add_pixels_clamped_c;
481
482 pix_abs16x16 = pix_abs16x16_c;
483 pix_abs16x16_x2 = pix_abs16x16_x2_c;
484 pix_abs16x16_y2 = pix_abs16x16_y2_c;
485 pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
486 av_fdct = jpeg_fdct_ifast;
487
c34270f5 488 use_permuted_idct = 1;
e0eac44e 489
980fc7b8 490#ifdef HAVE_MMX
de6d9b64
FB
491 dsputil_init_mmx();
492#endif
3d03c0a2
FB
493#ifdef ARCH_ARMV4L
494 dsputil_init_armv4l();
495#endif
c34270f5
FB
496#ifdef HAVE_MLIB
497 dsputil_init_mlib();
498 use_permuted_idct = 0;
499#endif
1e98dffb
NK
500#ifdef ARCH_ALPHA
501 dsputil_init_alpha();
502 use_permuted_idct = 0;
503#endif
c34270f5 504
d962f6fd
A
505#ifdef SIMPLE_IDCT
506 if(ff_idct == simple_idct) use_permuted_idct=0;
507#endif
508
5a240838
MN
509 if(use_permuted_idct)
510#ifdef SIMPLE_IDCT
511 for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
512#else
513 for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
514#endif
515 else
516 for(i=0; i<64; i++) permutation[i]=i;
517
c34270f5
FB
518 if (use_permuted_idct) {
519 /* permute for IDCT */
520 for(i=0;i<64;i++) {
521 j = zigzag_direct[i];
522 zigzag_direct[i] = block_permute_op(j);
523 j = ff_alternate_horizontal_scan[i];
524 ff_alternate_horizontal_scan[i] = block_permute_op(j);
525 j = ff_alternate_vertical_scan[i];
526 ff_alternate_vertical_scan[i] = block_permute_op(j);
527 }
528 block_permute(default_intra_matrix);
529 block_permute(default_non_intra_matrix);
530 }
badaf88e
MN
531
532 build_zigzag_end();
de6d9b64 533}