removed unused stuff - added dsputil_set_bit_exact() support for easier testing
[libav.git] / libavcodec / dsputil.c
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Gerard Lantau.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 *
19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
20 */
21 #include "avcodec.h"
22 #include "dsputil.h"
23 #include "simple_idct.h"
24
/* Global function pointers; each is expected to point at the fastest
   available implementation (C reference or a CPU-specific version).
   NOTE(review): the init code that fills these is not in this chunk —
   confirm where they are assigned. */
void (*ff_idct)(DCTELEM *block);
void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
void (*clear_blocks)(DCTELEM *blocks);

/* sum-of-absolute-differences pointers for 16x16 blocks; the _x2/_y2/_xy2
   variants compare against half-pel interpolated references */
op_pixels_abs_func pix_abs16x16;
op_pixels_abs_func pix_abs16x16_x2;
op_pixels_abs_func pix_abs16x16_y2;
op_pixels_abs_func pix_abs16x16_xy2;

/* same set for 8x8 blocks */
op_pixels_abs_func pix_abs8x8;
op_pixels_abs_func pix_abs8x8_x2;
op_pixels_abs_func pix_abs8x8_y2;
op_pixels_abs_func pix_abs8x8_xy2;
42
/* clipping table: cropTbl[MAX_NEG_CROP + x] is intended to be x clamped to
   [0,255]; squareTbl is a square lookup. NOTE(review): both are zero at
   load time — presumably filled by init code outside this chunk; confirm. */
UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
UINT32 squareTbl[512];

/* default quantization matrices, defined elsewhere */
extern UINT16 default_intra_matrix[64];
extern UINT16 default_non_intra_matrix[64];
extern UINT16 ff_mpeg4_default_intra_matrix[64];
extern UINT16 ff_mpeg4_default_non_intra_matrix[64];
50
/* classic MPEG zig-zag scan order: maps scan position -> raster index
   within an 8x8 coefficient block */
UINT8 zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
UINT16 __align8 inv_zigzag_direct16[64];

/* not permutated zigzag_direct for MMX quantizer */
UINT8 zigzag_direct_noperm[64];
67
/* alternate horizontal scan order (scan position -> raster index) */
UINT8 ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

/* alternate vertical scan order (scan position -> raster index) */
UINT8 ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
89
#ifdef SIMPLE_IDCT

/* Input permutation for the simple_idct_mmx */
static UINT8 simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
#endif
104
/* reciprocal table for division by multiplication:
   a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
UINT32 inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
140
/* zigzag_end[i]: highest raster index reached within the first i+1 zig-zag
   positions, plus one — used to skip zeros at the end of a block.
   Filled by build_zigzag_end() below. */
UINT8 zigzag_end[64];

/* IDCT input permutation — NOTE(review): filled outside this chunk; confirm */
UINT8 permutation[64];
//UINT8 invPermutation[64];
146
147 static void build_zigzag_end()
148 {
149 int lastIndex;
150 int lastIndexAfterPerm=0;
151 for(lastIndex=0; lastIndex<64; lastIndex++)
152 {
153 if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
154 lastIndexAfterPerm= zigzag_direct[lastIndex];
155 zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
156 }
157 }
158
159 void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size)
160 {
161 DCTELEM *p;
162 const UINT8 *pix;
163 int i;
164
165 /* read the pixels */
166 p = block;
167 pix = pixels;
168 for(i=0;i<8;i++) {
169 p[0] = pix[0];
170 p[1] = pix[1];
171 p[2] = pix[2];
172 p[3] = pix[3];
173 p[4] = pix[4];
174 p[5] = pix[5];
175 p[6] = pix[6];
176 p[7] = pix[7];
177 pix += line_size;
178 p += 8;
179 }
180 }
181
182 void diff_pixels_c(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride){
183 DCTELEM *p;
184 int i;
185
186 /* read the pixels */
187 p = block;
188 for(i=0;i<8;i++) {
189 p[0] = s1[0] - s2[0];
190 p[1] = s1[1] - s2[1];
191 p[2] = s1[2] - s2[2];
192 p[3] = s1[3] - s2[3];
193 p[4] = s1[4] - s2[4];
194 p[5] = s1[5] - s2[5];
195 p[6] = s1[6] - s2[6];
196 p[7] = s1[7] - s2[7];
197 s1 += stride;
198 s2 += stride;
199 p += 8;
200 }
201 }
202
203
204 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
205 {
206 const DCTELEM *p;
207 UINT8 *pix;
208 int i;
209 UINT8 *cm = cropTbl + MAX_NEG_CROP;
210
211 /* read the pixels */
212 p = block;
213 pix = pixels;
214 for(i=0;i<8;i++) {
215 pix[0] = cm[p[0]];
216 pix[1] = cm[p[1]];
217 pix[2] = cm[p[2]];
218 pix[3] = cm[p[3]];
219 pix[4] = cm[p[4]];
220 pix[5] = cm[p[5]];
221 pix[6] = cm[p[6]];
222 pix[7] = cm[p[7]];
223 pix += line_size;
224 p += 8;
225 }
226 }
227
228 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
229 {
230 const DCTELEM *p;
231 UINT8 *pix;
232 int i;
233 UINT8 *cm = cropTbl + MAX_NEG_CROP;
234
235 /* read the pixels */
236 p = block;
237 pix = pixels;
238 for(i=0;i<8;i++) {
239 pix[0] = cm[pix[0] + p[0]];
240 pix[1] = cm[pix[1] + p[1]];
241 pix[2] = cm[pix[2] + p[2]];
242 pix[3] = cm[pix[3] + p[3]];
243 pix[4] = cm[pix[4] + p[4]];
244 pix[5] = cm[pix[5] + p[5]];
245 pix[6] = cm[pix[6] + p[6]];
246 pix[7] = cm[pix[7] + p[7]];
247 pix += line_size;
248 p += 8;
249 }
250 }
251
#ifdef __GNUC__

/* GCC path: a packed struct forces the compiler to emit code that is safe
   for unaligned loads on strict-alignment targets. */
struct unaligned_64 { uint64_t l; } __attribute__((packed));
struct unaligned_32 { uint32_t l; } __attribute__((packed));

#define LD32(a) (((const struct unaligned_32 *) (a))->l)
#define LD64(a) (((const struct unaligned_64 *) (a))->l)

#else /* __GNUC__ */

/* Fallback: plain (possibly unaligned) loads — NOTE(review): assumes the
   target tolerates unaligned access; confirm for non-GCC strict-alignment
   platforms. */
#define LD32(a) (*((uint32_t*)(a)))
#define LD64(a) (*((uint64_t*)(a)))

#endif /* !__GNUC__ */
266
#if 0

/* 64-bit scalar variant of the pixel-copy/average primitives (disabled;
   the 32-bit variant in the #else branch below is the one compiled in).
   PIXOP2 expands to: _pixels (plain copy/avg), _x2/_y2 (half-pel
   horizontal/vertical averages, rounding), _no_rnd_* (truncating
   averages), and _xy2 (half-pel diagonal, done with 2-bit split
   arithmetic to average four samples per byte lane without overflow),
   plus the two dispatch tables. */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};

/* rounding average of two 8-byte lanes: (a|b) - ((a^b)&0xFE..)>>1 */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

/* 32-bit scalar variant (the one actually compiled): same primitives as
   above, processing the 8-pixel row as two 4-byte words. */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels  );\
            const uint32_t b= LD32(pixels+1);\
            OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels  );\
            const uint32_t b= LD32(pixels+1);\
            OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels          );\
            const uint32_t b= LD32(pixels+line_size);\
            OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels          );\
            const uint32_t b= LD32(pixels+line_size);\
            OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};
/* rounding average of two 4-byte lanes */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
#endif

/* plain store */
#define op_put(a, b) a = b

/* instantiate avg_* and put_* primitive families plus their dispatch tables */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
601
#if 0
/* FIXME: this stuff could be removed as it's not really used anymore */

/* Disabled legacy per-pixel operation generator: expands plain/x2/y2/xy2
   variants for a given block element type BTYPE and row increment INCR. */
#define PIXOP(BTYPE, OPNAME, OP, INCR) \
\
static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
{                                                                                        \
    BTYPE *p;                                                                            \
    const UINT8 *pix;                                                                    \
                                                                                         \
    p = block;                                                                           \
    pix = pixels;                                                                        \
    do {                                                                                 \
        OP(p[0], pix[0]);                                                                \
        OP(p[1], pix[1]);                                                                \
        OP(p[2], pix[2]);                                                                \
        OP(p[3], pix[3]);                                                                \
        OP(p[4], pix[4]);                                                                \
        OP(p[5], pix[5]);                                                                \
        OP(p[6], pix[6]);                                                                \
        OP(p[7], pix[7]);                                                                \
        pix += line_size;                                                                \
        p += INCR;                                                                       \
    } while (--h);;                                                                      \
}                                                                                        \
                                                                                         \
static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{                                                                                        \
    BTYPE *p;                                                                            \
    const UINT8 *pix;                                                                    \
                                                                                         \
    p = block;                                                                           \
    pix = pixels;                                                                        \
    do {                                                                                 \
        OP(p[0], avg2(pix[0], pix[1]));                                                  \
        OP(p[1], avg2(pix[1], pix[2]));                                                  \
        OP(p[2], avg2(pix[2], pix[3]));                                                  \
        OP(p[3], avg2(pix[3], pix[4]));                                                  \
        OP(p[4], avg2(pix[4], pix[5]));                                                  \
        OP(p[5], avg2(pix[5], pix[6]));                                                  \
        OP(p[6], avg2(pix[6], pix[7]));                                                  \
        OP(p[7], avg2(pix[7], pix[8]));                                                  \
        pix += line_size;                                                                \
        p += INCR;                                                                       \
    } while (--h);                                                                       \
}                                                                                        \
                                                                                         \
static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{                                                                                        \
    BTYPE *p;                                                                            \
    const UINT8 *pix;                                                                    \
    const UINT8 *pix1;                                                                   \
                                                                                         \
    p = block;                                                                           \
    pix = pixels;                                                                        \
    pix1 = pixels + line_size;                                                           \
    do {                                                                                 \
        OP(p[0], avg2(pix[0], pix1[0]));                                                 \
        OP(p[1], avg2(pix[1], pix1[1]));                                                 \
        OP(p[2], avg2(pix[2], pix1[2]));                                                 \
        OP(p[3], avg2(pix[3], pix1[3]));                                                 \
        OP(p[4], avg2(pix[4], pix1[4]));                                                 \
        OP(p[5], avg2(pix[5], pix1[5]));                                                 \
        OP(p[6], avg2(pix[6], pix1[6]));                                                 \
        OP(p[7], avg2(pix[7], pix1[7]));                                                 \
        pix += line_size;                                                                \
        pix1 += line_size;                                                               \
        p += INCR;                                                                       \
    } while(--h);                                                                        \
}                                                                                        \
                                                                                         \
static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{                                                                                        \
    BTYPE *p;                                                                            \
    const UINT8 *pix;                                                                    \
    const UINT8 *pix1;                                                                   \
                                                                                         \
    p = block;                                                                           \
    pix = pixels;                                                                        \
    pix1 = pixels + line_size;                                                           \
    do {                                                                                 \
        OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1]));                                \
        OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2]));                                \
        OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3]));                                \
        OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4]));                                \
        OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5]));                                \
        OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6]));                                \
        OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7]));                                \
        OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8]));                                \
        pix += line_size;                                                                \
        pix1 += line_size;                                                               \
        p += INCR;                                                                       \
    } while(--h);                                                                        \
}                                                                                        \
                                                                                         \
void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
    OPNAME ## _pixels,                                                                   \
    OPNAME ## _pixels_x2,                                                                \
    OPNAME ## _pixels_y2,                                                                \
    OPNAME ## _pixels_xy2,                                                               \
};

/* rounding primitives */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

#define op_avg(a, b) a = avg2(a, b)
#define op_sub(a, b) a -= b

PIXOP(DCTELEM, sub, op_sub, 8)

/* not rounding primitives */
#undef avg2
#undef avg4
#define avg2(a,b) ((a+b)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+1)>>2)

/* motion estimation */

#undef avg2
#undef avg4
#endif
723
/* Rounding primitives: round-half-up average of 2 / 4 samples.
   Fix: arguments are now fully parenthesized so the macros stay correct
   when invoked with compound expressions (precedence hazard in the
   original unparenthesized form). */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
726
727 static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
728 {
729 const int A=(16-x16)*(16-y16);
730 const int B=( x16)*(16-y16);
731 const int C=(16-x16)*( y16);
732 const int D=( x16)*( y16);
733 int i;
734 rounder= 128 - rounder;
735
736 for(i=0; i<h; i++)
737 {
738 dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
739 dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
740 dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
741 dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
742 dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
743 dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
744 dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
745 dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
746 dst+= srcStride;
747 src+= srcStride;
748 }
749 }
750
/* Horizontal 8-tap-style lowpass for quarter-pel interpolation.
 * Produces 8 filtered pixels per row for h rows, reading 9 source pixels
 * (src[0..8]) per row; result is rounded with r and clamped via cropTbl.
 * r selects rounding (callers pass 16-r with r in {0,1}).
 * NOTE(review): the taps for dst[5..7] are asymmetric (src[8] reused,
 * e.g. dst[5] subtracts (src[2]+src[8]) instead of mirroring) — this is
 * edge handling near the right boundary; confirm against the intended
 * MPEG-4 qpel filter before "fixing". */
static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
{
    UINT8 *cm = cropTbl + MAX_NEG_CROP;
    int i;
    for(i=0; i<h; i++)
    {
        dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
        dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
        dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
        dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
        dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
        dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
        dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
        dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
        dst+=dstStride;
        src+=srcStride;
    }
}
769
/* Vertical counterpart of qpel_h_lowpass: filters one 8-pixel column at a
 * time for w columns, reading 9 vertically adjacent source pixels.
 * Same tap pattern (and the same asymmetric boundary taps on rows 5..7)
 * as the horizontal filter; r selects rounding. */
static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
{
    UINT8 *cm = cropTbl + MAX_NEG_CROP;
    int i;
    for(i=0; i<w; i++)
    {
        /* cache the column so each sample is loaded once */
        const int src0= src[0*srcStride];
        const int src1= src[1*srcStride];
        const int src2= src[2*srcStride];
        const int src3= src[3*srcStride];
        const int src4= src[4*srcStride];
        const int src5= src[5*srcStride];
        const int src6= src[6*srcStride];
        const int src7= src[7*srcStride];
        const int src8= src[8*srcStride];
        dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
        dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
        dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
        dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
        dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
        dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
        dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
        dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
        dst++;
        src++;
    }
}
797
798 static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
799 {
800 int i;
801 for(i=0; i<8; i++)
802 {
803 dst[0]= src[0];
804 dst[1]= src[1];
805 dst[2]= src[2];
806 dst[3]= src[3];
807 dst[4]= src[4];
808 dst[5]= src[5];
809 dst[6]= src[6];
810 dst[7]= src[7];
811 dst+=dstStride;
812 src+=srcStride;
813 }
814 }
815
816 static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
817 {
818 int i;
819 for(i=0; i<8; i++)
820 {
821 dst[0]= (src1[0] + src2[0] + r)>>1;
822 dst[1]= (src1[1] + src2[1] + r)>>1;
823 dst[2]= (src1[2] + src2[2] + r)>>1;
824 dst[3]= (src1[3] + src2[3] + r)>>1;
825 dst[4]= (src1[4] + src2[4] + r)>>1;
826 dst[5]= (src1[5] + src2[5] + r)>>1;
827 dst[6]= (src1[6] + src2[6] + r)>>1;
828 dst[7]= (src1[7] + src2[7] + r)>>1;
829 dst+=dstStride;
830 src1+=srcStride;
831 src2+=8;
832 }
833 }
834
835 static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
836 {
837 int i;
838 for(i=0; i<8; i++)
839 {
840 dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
841 dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
842 dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
843 dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
844 dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
845 dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
846 dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
847 dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
848 dst+=dstStride;
849 src1+=srcStride;
850 src2+=8;
851 src3+=8;
852 src4+=8;
853 }
854 }
855
856 #define QPEL_MC(r, name) \
857 static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
858 {\
859 put_block(dst, src, dstStride, srcStride);\
860 }\
861 \
862 static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
863 {\
864 UINT8 half[64];\
865 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
866 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
867 }\
868 \
869 static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
870 {\
871 qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
872 }\
873 \
874 static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
875 {\
876 UINT8 half[64];\
877 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
878 avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
879 }\
880 \
881 static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
882 {\
883 UINT8 half[64];\
884 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
885 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
886 }\
887 \
888 static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
889 {\
890 qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
891 }\
892 \
893 static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
894 {\
895 UINT8 half[64];\
896 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
897 avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
898 }\
899 static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
900 {\
901 UINT8 halfH[72];\
902 UINT8 halfV[64];\
903 UINT8 halfHV[64];\
904 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
905 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
906 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
907 avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
908 }\
909 static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
910 {\
911 UINT8 halfH[72];\
912 UINT8 halfV[64];\
913 UINT8 halfHV[64];\
914 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
915 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
916 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
917 avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
918 }\
919 static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
920 {\
921 UINT8 halfH[72];\
922 UINT8 halfV[64];\
923 UINT8 halfHV[64];\
924 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
925 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
926 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
927 avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
928 }\
929 static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
930 {\
931 UINT8 halfH[72];\
932 UINT8 halfV[64];\
933 UINT8 halfHV[64];\
934 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
935 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
936 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
937 avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
938 }\
939 static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
940 {\
941 UINT8 halfH[72];\
942 UINT8 halfHV[64];\
943 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
944 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
945 avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
946 }\
947 static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
948 {\
949 UINT8 halfH[72];\
950 UINT8 halfHV[64];\
951 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
952 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
953 avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
954 }\
955 static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
956 {\
957 UINT8 halfH[72];\
958 UINT8 halfV[64];\
959 UINT8 halfHV[64];\
960 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
961 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
962 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
963 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
964 }\
965 static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
966 {\
967 UINT8 halfH[72];\
968 UINT8 halfV[64];\
969 UINT8 halfHV[64];\
970 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
971 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
972 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
973 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
974 }\
975 static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
976 {\
977 UINT8 halfH[72];\
978 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
979 qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
980 }\
981 qpel_mc_func qpel_mc ## name ## _tab[16]={ \
982 qpel_mc00_c ## name, \
983 qpel_mc10_c ## name, \
984 qpel_mc20_c ## name, \
985 qpel_mc30_c ## name, \
986 qpel_mc01_c ## name, \
987 qpel_mc11_c ## name, \
988 qpel_mc21_c ## name, \
989 qpel_mc31_c ## name, \
990 qpel_mc02_c ## name, \
991 qpel_mc12_c ## name, \
992 qpel_mc22_c ## name, \
993 qpel_mc32_c ## name, \
994 qpel_mc03_c ## name, \
995 qpel_mc13_c ## name, \
996 qpel_mc23_c ## name, \
997 qpel_mc33_c ## name, \
998 };
999
/* Instantiate the quarter-pel motion-compensation function tables:
   one set with rounding (_rnd) and one without (_no_rnd) — presumably
   the no_rnd variants serve the MPEG-4 no-rounding modes; the first
   macro argument feeds the "16-r"/"2-r"/"1-r" rounder expressions
   inside QPEL_MC above. */
QPEL_MC(0, _rnd)
QPEL_MC(1, _no_rnd)
1002
1003 int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1004 {
1005 int s, i;
1006
1007 s = 0;
1008 for(i=0;i<16;i++) {
1009 s += abs(pix1[0] - pix2[0]);
1010 s += abs(pix1[1] - pix2[1]);
1011 s += abs(pix1[2] - pix2[2]);
1012 s += abs(pix1[3] - pix2[3]);
1013 s += abs(pix1[4] - pix2[4]);
1014 s += abs(pix1[5] - pix2[5]);
1015 s += abs(pix1[6] - pix2[6]);
1016 s += abs(pix1[7] - pix2[7]);
1017 s += abs(pix1[8] - pix2[8]);
1018 s += abs(pix1[9] - pix2[9]);
1019 s += abs(pix1[10] - pix2[10]);
1020 s += abs(pix1[11] - pix2[11]);
1021 s += abs(pix1[12] - pix2[12]);
1022 s += abs(pix1[13] - pix2[13]);
1023 s += abs(pix1[14] - pix2[14]);
1024 s += abs(pix1[15] - pix2[15]);
1025 pix1 += line_size;
1026 pix2 += line_size;
1027 }
1028 return s;
1029 }
1030
1031 int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1032 {
1033 int s, i;
1034
1035 s = 0;
1036 for(i=0;i<16;i++) {
1037 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1038 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1039 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1040 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1041 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1042 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1043 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1044 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1045 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1046 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1047 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1048 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1049 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1050 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1051 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1052 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1053 pix1 += line_size;
1054 pix2 += line_size;
1055 }
1056 return s;
1057 }
1058
1059 int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1060 {
1061 int s, i;
1062 UINT8 *pix3 = pix2 + line_size;
1063
1064 s = 0;
1065 for(i=0;i<16;i++) {
1066 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1067 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1068 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1069 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1070 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1071 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1072 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1073 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1074 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1075 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1076 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1077 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1078 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1079 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1080 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1081 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1082 pix1 += line_size;
1083 pix2 += line_size;
1084 pix3 += line_size;
1085 }
1086 return s;
1087 }
1088
1089 int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1090 {
1091 int s, i;
1092 UINT8 *pix3 = pix2 + line_size;
1093
1094 s = 0;
1095 for(i=0;i<16;i++) {
1096 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1097 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1098 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1099 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1100 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1101 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1102 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1103 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1104 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1105 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1106 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1107 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1108 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1109 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1110 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1111 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1112 pix1 += line_size;
1113 pix2 += line_size;
1114 pix3 += line_size;
1115 }
1116 return s;
1117 }
1118
1119 int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1120 {
1121 int s, i;
1122
1123 s = 0;
1124 for(i=0;i<8;i++) {
1125 s += abs(pix1[0] - pix2[0]);
1126 s += abs(pix1[1] - pix2[1]);
1127 s += abs(pix1[2] - pix2[2]);
1128 s += abs(pix1[3] - pix2[3]);
1129 s += abs(pix1[4] - pix2[4]);
1130 s += abs(pix1[5] - pix2[5]);
1131 s += abs(pix1[6] - pix2[6]);
1132 s += abs(pix1[7] - pix2[7]);
1133 pix1 += line_size;
1134 pix2 += line_size;
1135 }
1136 return s;
1137 }
1138
1139 int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1140 {
1141 int s, i;
1142
1143 s = 0;
1144 for(i=0;i<8;i++) {
1145 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1146 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1147 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1148 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1149 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1150 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1151 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1152 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1153 pix1 += line_size;
1154 pix2 += line_size;
1155 }
1156 return s;
1157 }
1158
1159 int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1160 {
1161 int s, i;
1162 UINT8 *pix3 = pix2 + line_size;
1163
1164 s = 0;
1165 for(i=0;i<8;i++) {
1166 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1167 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1168 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1169 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1170 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1171 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1172 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1173 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1174 pix1 += line_size;
1175 pix2 += line_size;
1176 pix3 += line_size;
1177 }
1178 return s;
1179 }
1180
1181 int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1182 {
1183 int s, i;
1184 UINT8 *pix3 = pix2 + line_size;
1185
1186 s = 0;
1187 for(i=0;i<8;i++) {
1188 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1189 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1190 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1191 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1192 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1193 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1194 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1195 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1196 pix1 += line_size;
1197 pix2 += line_size;
1198 pix3 += line_size;
1199 }
1200 return s;
1201 }
1202
/* Permute a 64-coefficient block in place so that it corresponds to
   the coefficient order expected by the MMX idct. */
#ifdef SIMPLE_IDCT
/* General permutation via block_permute_op() and a temporary copy —
   works for any permutation, but perhaps slightly slower. */
void block_permute(INT16 *block)
{
    int i;
    INT16 temp[64];

    /* scatter each coefficient to its permuted slot in temp */
    for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];

    /* copy the permuted result back over the input block */
    for(i=0; i<64; i++) block[i] = temp[i];
}
#else

/* Specialized in-place variant: per 8-element row, columns 1..6 are
   shuffled (even columns move down, odd columns move up) while
   columns 0 and 7 stay put — this matches the low-3-bit rearrangement
   (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2) used in dsputil_init. */
void block_permute(INT16 *block)
{
    int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    int i;

    for(i=0;i<8;i++) {
        tmp1 = block[1];
        tmp2 = block[2];
        tmp3 = block[3];
        tmp4 = block[4];
        tmp5 = block[5];
        tmp6 = block[6];
        block[1] = tmp2;
        block[2] = tmp4;
        block[3] = tmp6;
        block[4] = tmp1;
        block[5] = tmp3;
        block[6] = tmp5;
        block += 8;
    }
}
#endif
1240
1241 void clear_blocks_c(DCTELEM *blocks)
1242 {
1243 memset(blocks, 0, sizeof(DCTELEM)*6*64);
1244 }
1245
/* Install the portable C implementations of all DSP function pointers,
 * build the clamping and squaring lookup tables, let the platform
 * specific initializers override individual pointers, and finally set
 * up the coefficient scan/permutation tables to match the ordering
 * expected by the selected IDCT. Must be called before any of the
 * function pointers declared at the top of this file are used.
 */
void dsputil_init(void)
{
    int i, j;
    int use_permuted_idct;

    /* cropTbl[x + MAX_NEG_CROP] clamps x into [0, 255]: identity over
       0..255, saturating to 0 below and 255 above */
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        cropTbl[i] = 0;
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    /* squareTbl[d + 256] == d*d for d in [-256, 255] */
    for(i=0;i<512;i++) {
        squareTbl[i] = (i - 256) * (i - 256);
    }

#ifdef SIMPLE_IDCT
    ff_idct = simple_idct;
#else
    ff_idct = j_rev_dct;
#endif
    /* default to the C reference implementations; the per-arch init
       calls below may replace any of these */
    get_pixels = get_pixels_c;
    diff_pixels = diff_pixels_c;
    put_pixels_clamped = put_pixels_clamped_c;
    add_pixels_clamped = add_pixels_clamped_c;
    gmc1= gmc1_c;
    clear_blocks= clear_blocks_c;

    pix_abs16x16 = pix_abs16x16_c;
    pix_abs16x16_x2 = pix_abs16x16_x2_c;
    pix_abs16x16_y2 = pix_abs16x16_y2_c;
    pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
    pix_abs8x8 = pix_abs8x8_c;
    pix_abs8x8_x2 = pix_abs8x8_x2_c;
    pix_abs8x8_y2 = pix_abs8x8_y2_c;
    pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
    av_fdct = jpeg_fdct_ifast;

    use_permuted_idct = 1;

#ifdef HAVE_MMX
    dsputil_init_mmx();
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l();
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib();
    use_permuted_idct = 0;   /* mlib IDCT uses natural coefficient order */
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha();
    use_permuted_idct = 0;   /* alpha IDCT uses natural coefficient order */
#endif

#ifdef SIMPLE_IDCT
    /* an arch init above may have replaced ff_idct; only skip the
       permutation when simple_idct is actually the one in use */
    if(ff_idct == simple_idct) use_permuted_idct=0;
#endif

    /* build the index permutation applied to coefficient positions */
    if(use_permuted_idct)
#ifdef SIMPLE_IDCT
        for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
#else
        for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
#endif
    else
        for(i=0; i<64; i++) permutation[i]=i;

    /* inverse zigzag is 1-based (0 means "not present") */
    for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
    /* keep an unpermuted copy before zigzag_direct is rewritten below */
    for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];

    if (use_permuted_idct) {
        /* permute the scan tables and default quant matrices in place
           so they address coefficients in the IDCT's expected order */
        for(i=0;i<64;i++) {
            j = zigzag_direct[i];
            zigzag_direct[i] = block_permute_op(j);
            j = ff_alternate_horizontal_scan[i];
            ff_alternate_horizontal_scan[i] = block_permute_op(j);
            j = ff_alternate_vertical_scan[i];
            ff_alternate_vertical_scan[i] = block_permute_op(j);
        }
        block_permute(default_intra_matrix);
        block_permute(default_non_intra_matrix);
        block_permute(ff_mpeg4_default_intra_matrix);
        block_permute(ff_mpeg4_default_non_intra_matrix);
    }

    build_zigzag_end();
}
1334
/* Remove any non bit exact operation (testing purpose): forwards to
   the MMX-specific hook when MMX support is compiled in; a no-op on
   every other build. */
void avcodec_set_bit_exact(void)
{
#ifdef HAVE_MMX
    dsputil_set_bit_exact_mmx();
#endif
}
1342
1343 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1344 int orig_linesize[3], int coded_linesize,
1345 AVCodecContext *avctx)
1346 {
1347 int quad, diff, x, y;
1348 UINT8 *orig, *coded;
1349 UINT32 *sq = squareTbl + 256;
1350
1351 quad = 0;
1352 diff = 0;
1353
1354 /* Luminance */
1355 orig = orig_image[0];
1356 coded = coded_image[0];
1357
1358 for (y=0;y<avctx->height;y++) {
1359 for (x=0;x<avctx->width;x++) {
1360 diff = *(orig + x) - *(coded + x);
1361 quad += sq[diff];
1362 }
1363 orig += orig_linesize[0];
1364 coded += coded_linesize;
1365 }
1366
1367 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1368
1369 if (avctx->psnr_y) {
1370 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1371 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1372 } else
1373 avctx->psnr_y = 99.99;
1374 }
1375