move unaligned access macros to dsputil.h - added unaligned 32 bit store
[libav.git] / libavcodec / dsputil.c
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
20 */
21 #include "avcodec.h"
22 #include "dsputil.h"
23 #include "simple_idct.h"
24
/* Function pointers for the DSP primitives; they are filled in at init time
   with either the C reference implementations below or CPU-specific
   (e.g. MMX) versions selected elsewhere in the project. */
void (*ff_idct)(DCTELEM *block);
void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block);
void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block);
void (*av_fdct)(DCTELEM *block);
void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
void (*clear_blocks)(DCTELEM *blocks);

/* Sum-of-absolute-differences (SAD) comparators on 16x16 blocks,
   plus the half-pel interpolated variants (x2/y2/xy2). */
op_pixels_abs_func pix_abs16x16;
op_pixels_abs_func pix_abs16x16_x2;
op_pixels_abs_func pix_abs16x16_y2;
op_pixels_abs_func pix_abs16x16_xy2;

/* Same comparators for 8x8 blocks. */
op_pixels_abs_func pix_abs8x8;
op_pixels_abs_func pix_abs8x8_x2;
op_pixels_abs_func pix_abs8x8_y2;
op_pixels_abs_func pix_abs8x8_xy2;

/* Clipping table: cropTbl + MAX_NEG_CROP maps any value in
   [-MAX_NEG_CROP, 255 + MAX_NEG_CROP] to the range [0, 255]. */
UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
/* squareTbl is presumably x*x lookup centered at 256 -- filled elsewhere; confirm at init site. */
UINT32 squareTbl[512];

/* Default quantization matrices, defined in other translation units. */
extern INT16 default_intra_matrix[64];
extern INT16 default_non_intra_matrix[64];
extern INT16 ff_mpeg4_default_intra_matrix[64];
extern INT16 ff_mpeg4_default_non_intra_matrix[64];
53
/* Standard zigzag scan order: maps scan position -> raster index within
   an 8x8 coefficient block (low frequencies first). */
UINT8 zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
64
/* not permutated inverse zigzag_direct + 1 for MMX quantizer
   (filled in at init time; __align8 for aligned SIMD loads) */
UINT16 __align8 inv_zigzag_direct16[64];

/* not permutated zigzag_direct for MMX quantizer (filled in at init time) */
UINT8 zigzag_direct_noperm[64];
70
/* Alternate horizontal coefficient scan order (scan position -> raster index),
   used instead of the zigzag scan for some coding modes. */
UINT8 ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
81
/* Alternate vertical coefficient scan order (scan position -> raster index),
   typically used for interlaced/field-coded blocks. */
UINT8 ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
92
#ifdef SIMPLE_IDCT

/* Input permutation for the simple_idct_mmx: maps raster coefficient index
   to the index expected by the MMX IDCT's internal layout. */
static UINT8 simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
#endif
107
/* Reciprocal table for division by multiplication:
   a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
   (entry b is ceil(2^32 / b); entries 0 and 1 are placeholders). */
UINT32 inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
143
/* zigzag_end[i] = 1 + highest raster index touched by scan positions 0..i;
   used to skip trailing zero coefficients (built by build_zigzag_end below) */
UINT8 zigzag_end[64];

/* coefficient permutation used by the selected IDCT -- filled in at init time */
UINT8 permutation[64];
//UINT8 invPermutation[64];
149
150 static void build_zigzag_end(void)
151 {
152 int lastIndex;
153 int lastIndexAfterPerm=0;
154 for(lastIndex=0; lastIndex<64; lastIndex++)
155 {
156 if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
157 lastIndexAfterPerm= zigzag_direct[lastIndex];
158 zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
159 }
160 }
161
162 void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size)
163 {
164 DCTELEM *p;
165 const UINT8 *pix;
166 int i;
167
168 /* read the pixels */
169 p = block;
170 pix = pixels;
171 for(i=0;i<8;i++) {
172 p[0] = pix[0];
173 p[1] = pix[1];
174 p[2] = pix[2];
175 p[3] = pix[3];
176 p[4] = pix[4];
177 p[5] = pix[5];
178 p[6] = pix[6];
179 p[7] = pix[7];
180 pix += line_size;
181 p += 8;
182 }
183 }
184
185 void diff_pixels_c(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride){
186 DCTELEM *p;
187 int i;
188
189 /* read the pixels */
190 p = block;
191 for(i=0;i<8;i++) {
192 p[0] = s1[0] - s2[0];
193 p[1] = s1[1] - s2[1];
194 p[2] = s1[2] - s2[2];
195 p[3] = s1[3] - s2[3];
196 p[4] = s1[4] - s2[4];
197 p[5] = s1[5] - s2[5];
198 p[6] = s1[6] - s2[6];
199 p[7] = s1[7] - s2[7];
200 s1 += stride;
201 s2 += stride;
202 p += 8;
203 }
204 }
205
206
207 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
208 {
209 const DCTELEM *p;
210 UINT8 *pix;
211 int i;
212 UINT8 *cm = cropTbl + MAX_NEG_CROP;
213
214 /* read the pixels */
215 p = block;
216 pix = pixels;
217 for(i=0;i<8;i++) {
218 pix[0] = cm[p[0]];
219 pix[1] = cm[p[1]];
220 pix[2] = cm[p[2]];
221 pix[3] = cm[p[3]];
222 pix[4] = cm[p[4]];
223 pix[5] = cm[p[5]];
224 pix[6] = cm[p[6]];
225 pix[7] = cm[p[7]];
226 pix += line_size;
227 p += 8;
228 }
229 }
230
231 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
232 {
233 const DCTELEM *p;
234 UINT8 *pix;
235 int i;
236 UINT8 *cm = cropTbl + MAX_NEG_CROP;
237
238 /* read the pixels */
239 p = block;
240 pix = pixels;
241 for(i=0;i<8;i++) {
242 pix[0] = cm[pix[0] + p[0]];
243 pix[1] = cm[pix[1] + p[1]];
244 pix[2] = cm[pix[2] + p[2]];
245 pix[3] = cm[pix[3] + p[3]];
246 pix[4] = cm[pix[4] + p[4]];
247 pix[5] = cm[pix[5] + p[5]];
248 pix[6] = cm[pix[6] + p[6]];
249 pix[7] = cm[pix[7] + p[7]];
250 pix += line_size;
251 p += 8;
252 }
253 }
254
/* Half-pel pixel operations (put/avg of whole, x+1/2, y+1/2 and x+y+1/2
 * positions on 8-pixel-wide rows). Two implementations of the PIXOP2 macro
 * follow: a 64-bit-word version (currently disabled with #if 0) and the
 * active 32-bit-word version. Both use the classic SWAR averaging trick:
 * avg(a,b) per byte == (a|b) - (((a^b)&0xFE..FE)>>1) without carries
 * crossing byte lanes. LD32/LD64 are the unaligned-load macros from
 * dsputil.h. */
#if 0

#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 32 bit variant (the disabled branch above is the 64 bit one)

#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels  );\
            const uint32_t b= LD32(pixels+1);\
            OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels  );\
            const uint32_t b= LD32(pixels+1);\
            OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels          );\
            const uint32_t b= LD32(pixels+line_size);\
            OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels          );\
            const uint32_t b= LD32(pixels+line_size);\
            OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
#endif

#define op_put(a, b) a = b

/* Instantiate the averaging and plain-store variants, then drop the
 * helper macros so they cannot leak into the rest of the file. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
589
/* Legacy element-wise PIXOP framework, kept only for reference.
 * The whole section is compiled out with #if 0. */
#if 0
/* FIXME this stuff could be removed as it's not really used anymore */
#define PIXOP(BTYPE, OPNAME, OP, INCR) \
\
static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
{                                                                                        \
    BTYPE *p;                                                                            \
    const UINT8 *pix;                                                                    \
                                                                                         \
    p = block;                                                                           \
    pix = pixels;                                                                        \
    do {                                                                                 \
        OP(p[0], pix[0]);                                                                  \
        OP(p[1], pix[1]);                                                                  \
        OP(p[2], pix[2]);                                                                  \
        OP(p[3], pix[3]);                                                                  \
        OP(p[4], pix[4]);                                                                  \
        OP(p[5], pix[5]);                                                                  \
        OP(p[6], pix[6]);                                                                  \
        OP(p[7], pix[7]);                                                                  \
        pix += line_size;                                                                \
        p += INCR;                                                                       \
    } while (--h);;                                                                       \
}                                                                                        \
\
static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h)   \
{                                                                                        \
    BTYPE *p;                                                                          \
    const UINT8 *pix;                                                                    \
                                                                                         \
    p = block;                                                                           \
    pix = pixels;                                                                        \
    do {                                                                                 \
        OP(p[0], avg2(pix[0], pix[1]));                                                    \
        OP(p[1], avg2(pix[1], pix[2]));                                                    \
        OP(p[2], avg2(pix[2], pix[3]));                                                    \
        OP(p[3], avg2(pix[3], pix[4]));                                                    \
        OP(p[4], avg2(pix[4], pix[5]));                                                    \
        OP(p[5], avg2(pix[5], pix[6]));                                                    \
        OP(p[6], avg2(pix[6], pix[7]));                                                    \
        OP(p[7], avg2(pix[7], pix[8]));                                                    \
        pix += line_size;                                                                \
        p += INCR;                                                                       \
    } while (--h);                                                                       \
}                                                                                        \
\
static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h)   \
{                                                                                        \
    BTYPE *p;                                                                          \
    const UINT8 *pix;                                                                    \
    const UINT8 *pix1;                                                                   \
                                                                                         \
    p = block;                                                                           \
    pix = pixels;                                                                        \
    pix1 = pixels + line_size;                                                           \
    do {                                                                                 \
        OP(p[0], avg2(pix[0], pix1[0]));                                                   \
        OP(p[1], avg2(pix[1], pix1[1]));                                                   \
        OP(p[2], avg2(pix[2], pix1[2]));                                                   \
        OP(p[3], avg2(pix[3], pix1[3]));                                                   \
        OP(p[4], avg2(pix[4], pix1[4]));                                                   \
        OP(p[5], avg2(pix[5], pix1[5]));                                                   \
        OP(p[6], avg2(pix[6], pix1[6]));                                                   \
        OP(p[7], avg2(pix[7], pix1[7]));                                                   \
        pix += line_size;                                                                \
        pix1 += line_size;                                                               \
        p += INCR;                                                                       \
    } while(--h);                                                                        \
}                                                                                        \
\
static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h)  \
{                                                                                        \
    BTYPE *p;                                                                          \
    const UINT8 *pix;                                                                    \
    const UINT8 *pix1;                                                                   \
                                                                                         \
    p = block;                                                                           \
    pix = pixels;                                                                        \
    pix1 = pixels + line_size;                                                           \
    do {                                                                                 \
        OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1]));                                  \
        OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2]));                                  \
        OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3]));                                  \
        OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4]));                                  \
        OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5]));                                  \
        OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6]));                                  \
        OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7]));                                  \
        OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8]));                                  \
        pix += line_size;                                                                \
        pix1 += line_size;                                                               \
        p += INCR;                                                                       \
    } while(--h);                                                                        \
}                                                                                        \
\
void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
    OPNAME ## _pixels,                                                                   \
    OPNAME ## _pixels_x2,                                                                \
    OPNAME ## _pixels_y2,                                                                \
    OPNAME ## _pixels_xy2,                                                               \
};

/* rounding primitives */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

#define op_avg(a, b) a = avg2(a, b)
#define op_sub(a, b) a -= b

PIXOP(DCTELEM, sub, op_sub, 8)

/* not rounding primitives */
#undef avg2
#undef avg4
#define avg2(a,b) ((a+b)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+1)>>2)

/* motion estimation */

#undef avg2
#undef avg4
#endif
711
/* Rounding averages used by the scalar motion-compensation / SAD code below.
 * Arguments are fully parenthesized so that operands with lower precedence
 * than '+' (shifts, bitwise ops, ?:) expand with the intended meaning. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
714
715 static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
716 {
717 const int A=(16-x16)*(16-y16);
718 const int B=( x16)*(16-y16);
719 const int C=(16-x16)*( y16);
720 const int D=( x16)*( y16);
721 int i;
722 rounder= 128 - rounder;
723
724 for(i=0; i<h; i++)
725 {
726 dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
727 dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
728 dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
729 dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
730 dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
731 dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
732 dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
733 dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
734 dst+= srcStride;
735 src+= srcStride;
736 }
737 }
738
/* Horizontal lowpass filter for quarter-pel interpolation.
 * Each output is a weighted sum with taps 20/-6/3/-1 over a 9-pixel row
 * (src[0]..src[8]), rounded by r and shifted down by 5, then clamped to
 * [0,255] via cropTbl. Note the tap patterns for dst[5..7] reuse edge
 * samples (src[7], src[8]) instead of reading past the row -- presumably
 * edge mirroring per the codec spec; confirm against the MPEG-4 qpel
 * definition before changing. Writes h rows of 8 pixels. */
static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
{
    UINT8 *cm = cropTbl + MAX_NEG_CROP;
    int i;
    for(i=0; i<h; i++)
    {
        dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
        dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
        dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
        dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
        dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
        dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
        dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
        dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
        dst+=dstStride;
        src+=srcStride;
    }
}
757
/* Vertical lowpass filter for quarter-pel interpolation: the transposed
 * counterpart of qpel_h_lowpass. Filters each of w columns over 9 input
 * rows (src0..src8) with taps 20/-6/3/-1, rounding r, shift 5, clamped to
 * [0,255] via cropTbl; the same edge-sample reuse appears in rows 5..7
 * (see note on qpel_h_lowpass). Writes 8 rows of w pixels. */
static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
{
    UINT8 *cm = cropTbl + MAX_NEG_CROP;
    int i;
    for(i=0; i<w; i++)
    {
        /* load the 9-sample column once; each sample is used several times */
        const int src0= src[0*srcStride];
        const int src1= src[1*srcStride];
        const int src2= src[2*srcStride];
        const int src3= src[3*srcStride];
        const int src4= src[4*srcStride];
        const int src5= src[5*srcStride];
        const int src6= src[6*srcStride];
        const int src7= src[7*srcStride];
        const int src8= src[8*srcStride];
        dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
        dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
        dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
        dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
        dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
        dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
        dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
        dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
        dst++;
        src++;
    }
}
785
786 static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
787 {
788 int i;
789 for(i=0; i<8; i++)
790 {
791 dst[0]= src[0];
792 dst[1]= src[1];
793 dst[2]= src[2];
794 dst[3]= src[3];
795 dst[4]= src[4];
796 dst[5]= src[5];
797 dst[6]= src[6];
798 dst[7]= src[7];
799 dst+=dstStride;
800 src+=srcStride;
801 }
802 }
803
804 static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
805 {
806 int i;
807 for(i=0; i<8; i++)
808 {
809 dst[0]= (src1[0] + src2[0] + r)>>1;
810 dst[1]= (src1[1] + src2[1] + r)>>1;
811 dst[2]= (src1[2] + src2[2] + r)>>1;
812 dst[3]= (src1[3] + src2[3] + r)>>1;
813 dst[4]= (src1[4] + src2[4] + r)>>1;
814 dst[5]= (src1[5] + src2[5] + r)>>1;
815 dst[6]= (src1[6] + src2[6] + r)>>1;
816 dst[7]= (src1[7] + src2[7] + r)>>1;
817 dst+=dstStride;
818 src1+=srcStride;
819 src2+=8;
820 }
821 }
822
823 static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
824 {
825 int i;
826 for(i=0; i<8; i++)
827 {
828 dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
829 dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
830 dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
831 dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
832 dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
833 dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
834 dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
835 dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
836 dst+=dstStride;
837 src1+=srcStride;
838 src2+=8;
839 src3+=8;
840 src4+=8;
841 }
842 }
843
844 #define QPEL_MC(r, name) \
845 static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
846 {\
847 put_block(dst, src, dstStride, srcStride);\
848 }\
849 \
850 static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
851 {\
852 UINT8 half[64];\
853 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
854 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
855 }\
856 \
857 static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
858 {\
859 qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
860 }\
861 \
862 static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
863 {\
864 UINT8 half[64];\
865 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
866 avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
867 }\
868 \
869 static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
870 {\
871 UINT8 half[64];\
872 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
873 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
874 }\
875 \
876 static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
877 {\
878 qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
879 }\
880 \
881 static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
882 {\
883 UINT8 half[64];\
884 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
885 avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
886 }\
887 static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
888 {\
889 UINT8 halfH[72];\
890 UINT8 halfV[64];\
891 UINT8 halfHV[64];\
892 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
893 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
894 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
895 avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
896 }\
897 static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
898 {\
899 UINT8 halfH[72];\
900 UINT8 halfV[64];\
901 UINT8 halfHV[64];\
902 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
903 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
904 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
905 avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
906 }\
907 static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
908 {\
909 UINT8 halfH[72];\
910 UINT8 halfV[64];\
911 UINT8 halfHV[64];\
912 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
913 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
914 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
915 avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
916 }\
917 static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
918 {\
919 UINT8 halfH[72];\
920 UINT8 halfV[64];\
921 UINT8 halfHV[64];\
922 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
923 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
924 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
925 avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
926 }\
927 static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
928 {\
929 UINT8 halfH[72];\
930 UINT8 halfHV[64];\
931 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
932 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
933 avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
934 }\
935 static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
936 {\
937 UINT8 halfH[72];\
938 UINT8 halfHV[64];\
939 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
940 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
941 avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
942 }\
943 static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
944 {\
945 UINT8 halfH[72];\
946 UINT8 halfV[64];\
947 UINT8 halfHV[64];\
948 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
949 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
950 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
951 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
952 }\
953 static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
954 {\
955 UINT8 halfH[72];\
956 UINT8 halfV[64];\
957 UINT8 halfHV[64];\
958 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
959 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
960 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
961 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
962 }\
963 static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
964 {\
965 UINT8 halfH[72];\
966 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
967 qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
968 }\
969 qpel_mc_func qpel_mc ## name ## _tab[16]={ \
970 qpel_mc00_c ## name, \
971 qpel_mc10_c ## name, \
972 qpel_mc20_c ## name, \
973 qpel_mc30_c ## name, \
974 qpel_mc01_c ## name, \
975 qpel_mc11_c ## name, \
976 qpel_mc21_c ## name, \
977 qpel_mc31_c ## name, \
978 qpel_mc02_c ## name, \
979 qpel_mc12_c ## name, \
980 qpel_mc22_c ## name, \
981 qpel_mc32_c ## name, \
982 qpel_mc03_c ## name, \
983 qpel_mc13_c ## name, \
984 qpel_mc23_c ## name, \
985 qpel_mc33_c ## name, \
986 };
987
/* Instantiate the quarter-pel MC function set twice: r=0 gives the
   rounding variants (*_rnd), r=1 the no-rounding variants (*_no_rnd). */
QPEL_MC(0, _rnd)
QPEL_MC(1, _no_rnd)

991 int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
992 {
993 int s, i;
994
995 s = 0;
996 for(i=0;i<16;i++) {
997 s += abs(pix1[0] - pix2[0]);
998 s += abs(pix1[1] - pix2[1]);
999 s += abs(pix1[2] - pix2[2]);
1000 s += abs(pix1[3] - pix2[3]);
1001 s += abs(pix1[4] - pix2[4]);
1002 s += abs(pix1[5] - pix2[5]);
1003 s += abs(pix1[6] - pix2[6]);
1004 s += abs(pix1[7] - pix2[7]);
1005 s += abs(pix1[8] - pix2[8]);
1006 s += abs(pix1[9] - pix2[9]);
1007 s += abs(pix1[10] - pix2[10]);
1008 s += abs(pix1[11] - pix2[11]);
1009 s += abs(pix1[12] - pix2[12]);
1010 s += abs(pix1[13] - pix2[13]);
1011 s += abs(pix1[14] - pix2[14]);
1012 s += abs(pix1[15] - pix2[15]);
1013 pix1 += line_size;
1014 pix2 += line_size;
1015 }
1016 return s;
1017 }
1018
1019 int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1020 {
1021 int s, i;
1022
1023 s = 0;
1024 for(i=0;i<16;i++) {
1025 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1026 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1027 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1028 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1029 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1030 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1031 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1032 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1033 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1034 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1035 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1036 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1037 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1038 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1039 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1040 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1041 pix1 += line_size;
1042 pix2 += line_size;
1043 }
1044 return s;
1045 }
1046
1047 int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1048 {
1049 int s, i;
1050 UINT8 *pix3 = pix2 + line_size;
1051
1052 s = 0;
1053 for(i=0;i<16;i++) {
1054 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1055 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1056 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1057 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1058 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1059 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1060 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1061 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1062 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1063 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1064 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1065 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1066 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1067 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1068 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1069 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1070 pix1 += line_size;
1071 pix2 += line_size;
1072 pix3 += line_size;
1073 }
1074 return s;
1075 }
1076
1077 int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1078 {
1079 int s, i;
1080 UINT8 *pix3 = pix2 + line_size;
1081
1082 s = 0;
1083 for(i=0;i<16;i++) {
1084 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1085 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1086 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1087 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1088 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1089 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1090 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1091 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1092 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1093 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1094 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1095 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1096 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1097 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1098 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1099 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1100 pix1 += line_size;
1101 pix2 += line_size;
1102 pix3 += line_size;
1103 }
1104 return s;
1105 }
1106
1107 int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1108 {
1109 int s, i;
1110
1111 s = 0;
1112 for(i=0;i<8;i++) {
1113 s += abs(pix1[0] - pix2[0]);
1114 s += abs(pix1[1] - pix2[1]);
1115 s += abs(pix1[2] - pix2[2]);
1116 s += abs(pix1[3] - pix2[3]);
1117 s += abs(pix1[4] - pix2[4]);
1118 s += abs(pix1[5] - pix2[5]);
1119 s += abs(pix1[6] - pix2[6]);
1120 s += abs(pix1[7] - pix2[7]);
1121 pix1 += line_size;
1122 pix2 += line_size;
1123 }
1124 return s;
1125 }
1126
1127 int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1128 {
1129 int s, i;
1130
1131 s = 0;
1132 for(i=0;i<8;i++) {
1133 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1134 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1135 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1136 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1137 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1138 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1139 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1140 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1141 pix1 += line_size;
1142 pix2 += line_size;
1143 }
1144 return s;
1145 }
1146
1147 int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1148 {
1149 int s, i;
1150 UINT8 *pix3 = pix2 + line_size;
1151
1152 s = 0;
1153 for(i=0;i<8;i++) {
1154 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1155 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1156 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1157 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1158 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1159 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1160 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1161 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1162 pix1 += line_size;
1163 pix2 += line_size;
1164 pix3 += line_size;
1165 }
1166 return s;
1167 }
1168
1169 int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1170 {
1171 int s, i;
1172 UINT8 *pix3 = pix2 + line_size;
1173
1174 s = 0;
1175 for(i=0;i<8;i++) {
1176 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1177 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1178 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1179 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1180 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1181 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1182 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1183 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1184 pix1 += line_size;
1185 pix2 += line_size;
1186 pix3 += line_size;
1187 }
1188 return s;
1189 }
1190
1191 /* permute block according so that it corresponds to the MMX idct
1192 order */
1193 #ifdef SIMPLE_IDCT
1194 /* general permutation, but perhaps slightly slower */
1195 void block_permute(INT16 *block)
1196 {
1197 int i;
1198 INT16 temp[64];
1199
1200 for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1201
1202 for(i=0; i<64; i++) block[i] = temp[i];
1203 }
1204 #else
1205
1206 void block_permute(INT16 *block)
1207 {
1208 int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1209 int i;
1210
1211 for(i=0;i<8;i++) {
1212 tmp1 = block[1];
1213 tmp2 = block[2];
1214 tmp3 = block[3];
1215 tmp4 = block[4];
1216 tmp5 = block[5];
1217 tmp6 = block[6];
1218 block[1] = tmp2;
1219 block[2] = tmp4;
1220 block[3] = tmp6;
1221 block[4] = tmp1;
1222 block[5] = tmp3;
1223 block[6] = tmp5;
1224 block += 8;
1225 }
1226 }
1227 #endif
1228
1229 void clear_blocks_c(DCTELEM *blocks)
1230 {
1231 memset(blocks, 0, sizeof(DCTELEM)*6*64);
1232 }
1233
/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */
/* Generic IDCT-and-store: run the currently selected IDCT on 'block',
   then clamp the result to [0,255] and write it to 'dest' (stride
   'line_size'), overwriting the destination. */
void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
{
    ff_idct (block);
    put_pixels_clamped(block, dest, line_size);
}
1241
/* Generic IDCT-and-add: run the currently selected IDCT on 'block',
   then add the clamped result to the pixels already in 'dest'
   (stride 'line_size'). */
void gen_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
{
    ff_idct (block);
    add_pixels_clamped(block, dest, line_size);
}
1247
/* One-time initialization of the DSP function pointers and lookup
   tables. Sets portable C implementations first, then lets the
   platform-specific init functions (MMX, ARMv4L, mlib, Alpha)
   override them. Must be called before any codec uses these tables. */
void dsputil_init(void)
{
    int i, j;
    int use_permuted_idct;

    /* cropTbl[x + MAX_NEG_CROP] clamps x to [0,255]: identity in the
       middle, 0 below, 255 above */
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        cropTbl[i] = 0;
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    /* squareTbl[i + 256] == i*i for i in [-256,255] */
    for(i=0;i<512;i++) {
        squareTbl[i] = (i - 256) * (i - 256);
    }

#ifdef SIMPLE_IDCT
    /* left NULL so the SIMPLE_IDCT branch below can detect whether an
       arch-specific init installed its own IDCT */
    ff_idct = NULL;
#else
    ff_idct = j_rev_dct;
#endif
    get_pixels = get_pixels_c;
    diff_pixels = diff_pixels_c;
    put_pixels_clamped = put_pixels_clamped_c;
    add_pixels_clamped = add_pixels_clamped_c;
    gmc1= gmc1_c;
    clear_blocks= clear_blocks_c;

    pix_abs16x16 = pix_abs16x16_c;
    pix_abs16x16_x2 = pix_abs16x16_x2_c;
    pix_abs16x16_y2 = pix_abs16x16_y2_c;
    pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
    pix_abs8x8 = pix_abs8x8_c;
    pix_abs8x8_x2 = pix_abs8x8_x2_c;
    pix_abs8x8_y2 = pix_abs8x8_y2_c;
    pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
    av_fdct = fdct_ifast;

    use_permuted_idct = 1;

    /* architecture-specific overrides; mlib and Alpha use an
       unpermuted (natural-order) IDCT */
#ifdef HAVE_MMX
    dsputil_init_mmx();
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l();
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib();
    use_permuted_idct = 0;
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha();
    use_permuted_idct = 0;
#endif

#ifdef SIMPLE_IDCT
    if (ff_idct == NULL) {
        /* no arch-specific IDCT was installed: use simple_idct, which
           works on unpermuted coefficients */
        ff_idct_put = simple_idct_put;
        ff_idct_add = simple_idct_add;
        use_permuted_idct=0;
    } else {
        ff_idct_put = gen_idct_put;
        ff_idct_add = gen_idct_add;
    }
#endif

    /* build the coefficient permutation table matching the chosen IDCT */
    if(use_permuted_idct)
#ifdef SIMPLE_IDCT
        for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
#else
        for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
#endif
    else
        for(i=0; i<64; i++) permutation[i]=i;

    for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
    for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];

    if (use_permuted_idct) {
        /* permute the scan tables and default quant matrices in place
           so they address coefficients in the IDCT's permuted order */
        for(i=0;i<64;i++) {
            j = zigzag_direct[i];
            zigzag_direct[i] = block_permute_op(j);
            j = ff_alternate_horizontal_scan[i];
            ff_alternate_horizontal_scan[i] = block_permute_op(j);
            j = ff_alternate_vertical_scan[i];
            ff_alternate_vertical_scan[i] = block_permute_op(j);
        }
        block_permute(default_intra_matrix);
        block_permute(default_non_intra_matrix);
        block_permute(ff_mpeg4_default_intra_matrix);
        block_permute(ff_mpeg4_default_non_intra_matrix);
    }

    build_zigzag_end();
}
1343
/* remove any non bit exact operation (testing purpose) */
/* Forces bit-exact output by disabling optimized routines whose
   results differ from the C reference (currently MMX only). */
void avcodec_set_bit_exact(void)
{
#ifdef HAVE_MMX
    dsputil_set_bit_exact_mmx();
#endif
}
1351
1352 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1353 int orig_linesize[3], int coded_linesize,
1354 AVCodecContext *avctx)
1355 {
1356 int quad, diff, x, y;
1357 UINT8 *orig, *coded;
1358 UINT32 *sq = squareTbl + 256;
1359
1360 quad = 0;
1361 diff = 0;
1362
1363 /* Luminance */
1364 orig = orig_image[0];
1365 coded = coded_image[0];
1366
1367 for (y=0;y<avctx->height;y++) {
1368 for (x=0;x<avctx->width;x++) {
1369 diff = *(orig + x) - *(coded + x);
1370 quad += sq[diff];
1371 }
1372 orig += orig_linesize[0];
1373 coded += coded_linesize;
1374 }
1375
1376 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1377
1378 if (avctx->psnr_y) {
1379 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1380 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1381 } else
1382 avctx->psnr_y = 99.99;
1383 }
1384