/* added ff_idct_put/add */
/* libavcodec/dsputil.c */
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
20 */
21 #include "avcodec.h"
22 #include "dsputil.h"
23 #include "simple_idct.h"
24
/* Function pointers selected at init time so each platform can plug in
   its optimized implementation (C fallbacks are defined in this file). */
void (*ff_idct)(DCTELEM *block);
void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block);
void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block);
void (*av_fdct)(DCTELEM *block);
void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
void (*clear_blocks)(DCTELEM *blocks);

/* pixel-difference (SAD-style) comparators for 16x16 blocks:
   plain, and half-pel interpolated in x, y and both */
op_pixels_abs_func pix_abs16x16;
op_pixels_abs_func pix_abs16x16_x2;
op_pixels_abs_func pix_abs16x16_y2;
op_pixels_abs_func pix_abs16x16_xy2;

/* same comparators for 8x8 blocks */
op_pixels_abs_func pix_abs8x8;
op_pixels_abs_func pix_abs8x8_x2;
op_pixels_abs_func pix_abs8x8_y2;
op_pixels_abs_func pix_abs8x8_xy2;

/* clip-to-[0,255] lookup table; indexed as (cropTbl + MAX_NEG_CROP)[v]
   so negative and >255 inputs land in the padded margins */
UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
/* presumably a squared-value lookup — filled by init code not visible
   in this chunk; TODO confirm */
UINT32 squareTbl[512];

/* default quantization matrices, defined in other translation units */
extern INT16 default_intra_matrix[64];
extern INT16 default_non_intra_matrix[64];
extern INT16 ff_mpeg4_default_intra_matrix[64];
extern INT16 ff_mpeg4_default_non_intra_matrix[64];
53
/* classic zigzag scan order: zigzag_direct[i] is the raster (row-major)
   index of the i-th scanned coefficient */
UINT8 zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
64
/* not permuted inverse zigzag_direct + 1 for the MMX quantizer
   (filled at runtime — no initializer here) */
UINT16 __align8 inv_zigzag_direct16[64];

/* not permuted zigzag_direct for the MMX quantizer */
UINT8 zigzag_direct_noperm[64];
70
/* alternate (horizontal-biased) coefficient scan order table */
UINT8 ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
81
/* alternate (vertical-biased) coefficient scan order table */
UINT8 ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
92
#ifdef SIMPLE_IDCT

/* Input permutation for the simple_idct_mmx: maps each raster index to
   the coefficient position the MMX IDCT expects (values are raster
   indices encoded in hex) */
static UINT8 simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
#endif
107
/* Fixed-point reciprocal table for division-free quantization:
   a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
   (inverse[b] ~= 2^32 / b, rounded up; entries 0 and 1 are special) */
UINT32 inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
143
/* used to skip zeros at the end: zigzag_end[i] is one past the highest
   raster index touched by the first i+1 zigzag positions
   (filled by build_zigzag_end) */
UINT8 zigzag_end[64];

/* coefficient permutation table — filled by init code not visible in
   this chunk; TODO confirm its producer */
UINT8 permutation[64];
//UINT8 invPermutation[64];
149
150 static void build_zigzag_end(void)
151 {
152 int lastIndex;
153 int lastIndexAfterPerm=0;
154 for(lastIndex=0; lastIndex<64; lastIndex++)
155 {
156 if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
157 lastIndexAfterPerm= zigzag_direct[lastIndex];
158 zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
159 }
160 }
161
162 void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size)
163 {
164 DCTELEM *p;
165 const UINT8 *pix;
166 int i;
167
168 /* read the pixels */
169 p = block;
170 pix = pixels;
171 for(i=0;i<8;i++) {
172 p[0] = pix[0];
173 p[1] = pix[1];
174 p[2] = pix[2];
175 p[3] = pix[3];
176 p[4] = pix[4];
177 p[5] = pix[5];
178 p[6] = pix[6];
179 p[7] = pix[7];
180 pix += line_size;
181 p += 8;
182 }
183 }
184
185 void diff_pixels_c(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride){
186 DCTELEM *p;
187 int i;
188
189 /* read the pixels */
190 p = block;
191 for(i=0;i<8;i++) {
192 p[0] = s1[0] - s2[0];
193 p[1] = s1[1] - s2[1];
194 p[2] = s1[2] - s2[2];
195 p[3] = s1[3] - s2[3];
196 p[4] = s1[4] - s2[4];
197 p[5] = s1[5] - s2[5];
198 p[6] = s1[6] - s2[6];
199 p[7] = s1[7] - s2[7];
200 s1 += stride;
201 s2 += stride;
202 p += 8;
203 }
204 }
205
206
207 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
208 {
209 const DCTELEM *p;
210 UINT8 *pix;
211 int i;
212 UINT8 *cm = cropTbl + MAX_NEG_CROP;
213
214 /* read the pixels */
215 p = block;
216 pix = pixels;
217 for(i=0;i<8;i++) {
218 pix[0] = cm[p[0]];
219 pix[1] = cm[p[1]];
220 pix[2] = cm[p[2]];
221 pix[3] = cm[p[3]];
222 pix[4] = cm[p[4]];
223 pix[5] = cm[p[5]];
224 pix[6] = cm[p[6]];
225 pix[7] = cm[p[7]];
226 pix += line_size;
227 p += 8;
228 }
229 }
230
231 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
232 {
233 const DCTELEM *p;
234 UINT8 *pix;
235 int i;
236 UINT8 *cm = cropTbl + MAX_NEG_CROP;
237
238 /* read the pixels */
239 p = block;
240 pix = pixels;
241 for(i=0;i<8;i++) {
242 pix[0] = cm[pix[0] + p[0]];
243 pix[1] = cm[pix[1] + p[1]];
244 pix[2] = cm[pix[2] + p[2]];
245 pix[3] = cm[pix[3] + p[3]];
246 pix[4] = cm[pix[4] + p[4]];
247 pix[5] = cm[pix[5] + p[5]];
248 pix[6] = cm[pix[6] + p[6]];
249 pix[7] = cm[pix[7] + p[7]];
250 pix += line_size;
251 p += 8;
252 }
253 }
254
#ifdef __GNUC__

/* Unaligned load helpers: on GCC, going through a packed struct tells
   the compiler the address may be misaligned, so it emits safe loads on
   strict-alignment targets. */
struct unaligned_64 { uint64_t l; } __attribute__((packed));
struct unaligned_32 { uint32_t l; } __attribute__((packed));

#define LD32(a) (((const struct unaligned_32 *) (a))->l)
#define LD64(a) (((const struct unaligned_64 *) (a))->l)

#else /* __GNUC__ */

/* Fallback: plain dereference — NOTE(review): assumes the target
   tolerates unaligned accesses; misaligned use is UB in general. */
#define LD32(a) (*((uint32_t*)(a)))
#define LD64(a) (*((uint64_t*)(a)))

#endif /* !__GNUC__ */
269
#if 0

/* 64-bit scalar variant of the pixel copy/average primitives (disabled;
   the 32-bit variant in the #else branch is the one compiled).
   PIXOP2(OPNAME, OP) expands to: a straight 8-byte-per-row op, half-pel
   interpolation in x, y and xy (rounding and no-rounding versions), and
   two dispatch tables indexed by the half-pel flags.  The bit tricks:
   (a|b) - (((a^b)&0xFE..)>>1) is a rounded per-byte average and
   (a&b) + (((a^b)&0xFE..)>>1) the truncated one, done on 8 packed bytes
   at once without carries crossing byte lanes. */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

/* 32-bit variant: same primitives as above but processing 4 packed
   bytes per operation (two ops per 8-pixel row, or a j-loop over the
   two 4-byte halves). */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels  );\
            const uint32_t b= LD32(pixels+1);\
            OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels  );\
            const uint32_t b= LD32(pixels+1);\
            OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels          );\
            const uint32_t b= LD32(pixels+line_size);\
            OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        int j;\
        for(j=0; j<2; j++){\
            const uint32_t a= LD32(pixels          );\
            const uint32_t b= LD32(pixels+line_size);\
            OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
            pixels+=4;\
            block +=4;\
        }\
        pixels+=line_size-8;\
        block +=line_size-8;\
    }\
}\
\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
#endif
597
#define op_put(a, b) a = b

/* instantiate the averaging (avg_*) and copying (put_*) pixel ops and
   their dispatch tables */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
604
#if 0
/* FIXME this stuff could be removed as it's not really used anymore */
/* Old byte-at-a-time PIXOP generator, kept disabled: expands plain /
   x / y / xy half-pel ops using the avg2/avg4 rounding macros and a
   dispatch table, for an arbitrary block element type BTYPE. */
#define PIXOP(BTYPE, OPNAME, OP, INCR) \
\
static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h)    \
{                                                                                         \
    BTYPE *p;                                                                             \
    const UINT8 *pix;                                                                     \
                                                                                          \
    p = block;                                                                            \
    pix = pixels;                                                                         \
    do {                                                                                  \
        OP(p[0], pix[0]);                                                                 \
        OP(p[1], pix[1]);                                                                 \
        OP(p[2], pix[2]);                                                                 \
        OP(p[3], pix[3]);                                                                 \
        OP(p[4], pix[4]);                                                                 \
        OP(p[5], pix[5]);                                                                 \
        OP(p[6], pix[6]);                                                                 \
        OP(p[7], pix[7]);                                                                 \
        pix += line_size;                                                                 \
        p += INCR;                                                                        \
    } while (--h);;                                                                       \
}                                                                                         \
\
static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{                                                                                         \
    BTYPE *p;                                                                             \
    const UINT8 *pix;                                                                     \
                                                                                          \
    p = block;                                                                            \
    pix = pixels;                                                                         \
    do {                                                                                  \
        OP(p[0], avg2(pix[0], pix[1]));                                                   \
        OP(p[1], avg2(pix[1], pix[2]));                                                   \
        OP(p[2], avg2(pix[2], pix[3]));                                                   \
        OP(p[3], avg2(pix[3], pix[4]));                                                   \
        OP(p[4], avg2(pix[4], pix[5]));                                                   \
        OP(p[5], avg2(pix[5], pix[6]));                                                   \
        OP(p[6], avg2(pix[6], pix[7]));                                                   \
        OP(p[7], avg2(pix[7], pix[8]));                                                   \
        pix += line_size;                                                                 \
        p += INCR;                                                                        \
    } while (--h);                                                                        \
}                                                                                         \
\
static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{                                                                                         \
    BTYPE *p;                                                                             \
    const UINT8 *pix;                                                                     \
    const UINT8 *pix1;                                                                    \
                                                                                          \
    p = block;                                                                            \
    pix = pixels;                                                                         \
    pix1 = pixels + line_size;                                                            \
    do {                                                                                  \
        OP(p[0], avg2(pix[0], pix1[0]));                                                  \
        OP(p[1], avg2(pix[1], pix1[1]));                                                  \
        OP(p[2], avg2(pix[2], pix1[2]));                                                  \
        OP(p[3], avg2(pix[3], pix1[3]));                                                  \
        OP(p[4], avg2(pix[4], pix1[4]));                                                  \
        OP(p[5], avg2(pix[5], pix1[5]));                                                  \
        OP(p[6], avg2(pix[6], pix1[6]));                                                  \
        OP(p[7], avg2(pix[7], pix1[7]));                                                  \
        pix += line_size;                                                                 \
        pix1 += line_size;                                                                \
        p += INCR;                                                                        \
    } while(--h);                                                                         \
}                                                                                         \
\
static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{                                                                                         \
    BTYPE *p;                                                                             \
    const UINT8 *pix;                                                                     \
    const UINT8 *pix1;                                                                    \
                                                                                          \
    p = block;                                                                            \
    pix = pixels;                                                                         \
    pix1 = pixels + line_size;                                                            \
    do {                                                                                  \
        OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1]));                                 \
        OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2]));                                 \
        OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3]));                                 \
        OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4]));                                 \
        OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5]));                                 \
        OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6]));                                 \
        OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7]));                                 \
        OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8]));                                 \
        pix += line_size;                                                                 \
        pix1 += line_size;                                                                \
        p += INCR;                                                                        \
    } while(--h);                                                                         \
}                                                                                         \
\
void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
    OPNAME ## _pixels,                                                                    \
    OPNAME ## _pixels_x2,                                                                 \
    OPNAME ## _pixels_y2,                                                                 \
    OPNAME ## _pixels_xy2,                                                                \
};

/* rounding primitives */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

#define op_avg(a, b) a = avg2(a, b)
#define op_sub(a, b) a -= b

PIXOP(DCTELEM, sub, op_sub, 8)

/* not rounding primitives */
#undef avg2
#undef avg4
#define avg2(a,b) ((a+b)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+1)>>2)

/* motion estimation */

#undef avg2
#undef avg4
#endif
726
/* rounding averages of 2 and 4 values.
   NOTE(review): arguments are unparenthesized and evaluated once each in
   a plain expression — only pass simple, side-effect-free expressions. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
729
730 static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
731 {
732 const int A=(16-x16)*(16-y16);
733 const int B=( x16)*(16-y16);
734 const int C=(16-x16)*( y16);
735 const int D=( x16)*( y16);
736 int i;
737 rounder= 128 - rounder;
738
739 for(i=0; i<h; i++)
740 {
741 dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
742 dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
743 dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
744 dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
745 dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
746 dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
747 dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
748 dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
749 dst+= srcStride;
750 src+= srcStride;
751 }
752 }
753
/* Horizontal lowpass filter for quarter-pel interpolation: each output
   pixel is a (20,-6,3,-1)-weighted sum of neighbouring source pixels,
   rounded with r and clipped to 0..255 via the crop table.  Reads up to
   src[8], i.e. one pixel past the 8-pixel row.
   NOTE(review): dst[5]..dst[7] use asymmetric taps (src[8] appears
   twice, sign pattern shuffled) rather than reading further right —
   this mirrors the block edge; looks intentional/legacy, confirm
   against the reference filter before changing. */
static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
{
    UINT8 *cm = cropTbl + MAX_NEG_CROP;
    int i;
    for(i=0; i<h; i++)
    {
        dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
        dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
        dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
        dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
        dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
        dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
        dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
        dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
        dst+=dstStride;
        src+=srcStride;
    }
}
772
/* Vertical lowpass filter for quarter-pel interpolation: same
   (20,-6,3,-1) tap pattern as qpel_h_lowpass but applied down a column,
   producing 8 output rows per column for w columns.  Reads 9 source
   rows (src[0..8*srcStride]).
   NOTE(review): rows 5..7 use the same asymmetric edge-mirroring taps
   as the horizontal filter (src8 appears twice) — keep the two filters
   in sync if either is ever changed. */
static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
{
    UINT8 *cm = cropTbl + MAX_NEG_CROP;
    int i;
    for(i=0; i<w; i++)
    {
        const int src0= src[0*srcStride];
        const int src1= src[1*srcStride];
        const int src2= src[2*srcStride];
        const int src3= src[3*srcStride];
        const int src4= src[4*srcStride];
        const int src5= src[5*srcStride];
        const int src6= src[6*srcStride];
        const int src7= src[7*srcStride];
        const int src8= src[8*srcStride];
        dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
        dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
        dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
        dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
        dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
        dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
        dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
        dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
        dst++;
        src++;
    }
}
800
801 static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
802 {
803 int i;
804 for(i=0; i<8; i++)
805 {
806 dst[0]= src[0];
807 dst[1]= src[1];
808 dst[2]= src[2];
809 dst[3]= src[3];
810 dst[4]= src[4];
811 dst[5]= src[5];
812 dst[6]= src[6];
813 dst[7]= src[7];
814 dst+=dstStride;
815 src+=srcStride;
816 }
817 }
818
819 static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
820 {
821 int i;
822 for(i=0; i<8; i++)
823 {
824 dst[0]= (src1[0] + src2[0] + r)>>1;
825 dst[1]= (src1[1] + src2[1] + r)>>1;
826 dst[2]= (src1[2] + src2[2] + r)>>1;
827 dst[3]= (src1[3] + src2[3] + r)>>1;
828 dst[4]= (src1[4] + src2[4] + r)>>1;
829 dst[5]= (src1[5] + src2[5] + r)>>1;
830 dst[6]= (src1[6] + src2[6] + r)>>1;
831 dst[7]= (src1[7] + src2[7] + r)>>1;
832 dst+=dstStride;
833 src1+=srcStride;
834 src2+=8;
835 }
836 }
837
838 static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
839 {
840 int i;
841 for(i=0; i<8; i++)
842 {
843 dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
844 dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
845 dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
846 dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
847 dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
848 dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
849 dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
850 dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
851 dst+=dstStride;
852 src1+=srcStride;
853 src2+=8;
854 src3+=8;
855 src4+=8;
856 }
857 }
858
/* QPEL_MC(r, name) generates the 16 quarter-pel motion-compensation
   routines qpel_mcXY_c<name> (X = horizontal quarter position 0..3,
   Y = vertical 0..3) plus a dispatch table.  Fractional positions are
   built from the h/v lowpass half-pel filters and 2- or 4-way averages;
   r = 0 gives the rounding variant, r = 1 the no-rounding variant
   (it flips the rounding constants via 16-r / 1-r / 2-r). */
#define QPEL_MC(r, name) \
static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    put_block(dst, src, dstStride, srcStride);\
}\
\
static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 half[64];\
    qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
    avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
}\
\
static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
}\
\
static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 half[64];\
    qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
    avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
}\
\
static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 half[64];\
    qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
    avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
}\
\
static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
}\
\
static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 half[64];\
    qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
    avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
}\
static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
}\
static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
}\
static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
}\
static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
}\
static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 halfH[72];\
    UINT8 halfHV[64];\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
}\
static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 halfH[72];\
    UINT8 halfHV[64];\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
}\
static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
}\
static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
    qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
    avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
}\
static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 halfH[72];\
    qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
}\
qpel_mc_func qpel_mc ## name ## _tab[16]={ \
    qpel_mc00_c ## name, \
    qpel_mc10_c ## name, \
    qpel_mc20_c ## name, \
    qpel_mc30_c ## name, \
    qpel_mc01_c ## name, \
    qpel_mc11_c ## name, \
    qpel_mc21_c ## name, \
    qpel_mc31_c ## name, \
    qpel_mc02_c ## name, \
    qpel_mc12_c ## name, \
    qpel_mc22_c ## name, \
    qpel_mc32_c ## name, \
    qpel_mc03_c ## name, \
    qpel_mc13_c ## name, \
    qpel_mc23_c ## name, \
    qpel_mc33_c ## name, \
};

/* rounding and no-rounding instantiations */
QPEL_MC(0, _rnd)
QPEL_MC(1, _no_rnd)
1005
1006 int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1007 {
1008 int s, i;
1009
1010 s = 0;
1011 for(i=0;i<16;i++) {
1012 s += abs(pix1[0] - pix2[0]);
1013 s += abs(pix1[1] - pix2[1]);
1014 s += abs(pix1[2] - pix2[2]);
1015 s += abs(pix1[3] - pix2[3]);
1016 s += abs(pix1[4] - pix2[4]);
1017 s += abs(pix1[5] - pix2[5]);
1018 s += abs(pix1[6] - pix2[6]);
1019 s += abs(pix1[7] - pix2[7]);
1020 s += abs(pix1[8] - pix2[8]);
1021 s += abs(pix1[9] - pix2[9]);
1022 s += abs(pix1[10] - pix2[10]);
1023 s += abs(pix1[11] - pix2[11]);
1024 s += abs(pix1[12] - pix2[12]);
1025 s += abs(pix1[13] - pix2[13]);
1026 s += abs(pix1[14] - pix2[14]);
1027 s += abs(pix1[15] - pix2[15]);
1028 pix1 += line_size;
1029 pix2 += line_size;
1030 }
1031 return s;
1032 }
1033
1034 int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1035 {
1036 int s, i;
1037
1038 s = 0;
1039 for(i=0;i<16;i++) {
1040 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1041 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1042 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1043 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1044 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1045 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1046 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1047 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1048 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1049 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1050 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1051 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1052 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1053 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1054 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1055 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1056 pix1 += line_size;
1057 pix2 += line_size;
1058 }
1059 return s;
1060 }
1061
1062 int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1063 {
1064 int s, i;
1065 UINT8 *pix3 = pix2 + line_size;
1066
1067 s = 0;
1068 for(i=0;i<16;i++) {
1069 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1070 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1071 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1072 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1073 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1074 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1075 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1076 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1077 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1078 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1079 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1080 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1081 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1082 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1083 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1084 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1085 pix1 += line_size;
1086 pix2 += line_size;
1087 pix3 += line_size;
1088 }
1089 return s;
1090 }
1091
1092 int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1093 {
1094 int s, i;
1095 UINT8 *pix3 = pix2 + line_size;
1096
1097 s = 0;
1098 for(i=0;i<16;i++) {
1099 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1100 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1101 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1102 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1103 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1104 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1105 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1106 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1107 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1108 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1109 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1110 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1111 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1112 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1113 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1114 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1115 pix1 += line_size;
1116 pix2 += line_size;
1117 pix3 += line_size;
1118 }
1119 return s;
1120 }
1121
1122 int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1123 {
1124 int s, i;
1125
1126 s = 0;
1127 for(i=0;i<8;i++) {
1128 s += abs(pix1[0] - pix2[0]);
1129 s += abs(pix1[1] - pix2[1]);
1130 s += abs(pix1[2] - pix2[2]);
1131 s += abs(pix1[3] - pix2[3]);
1132 s += abs(pix1[4] - pix2[4]);
1133 s += abs(pix1[5] - pix2[5]);
1134 s += abs(pix1[6] - pix2[6]);
1135 s += abs(pix1[7] - pix2[7]);
1136 pix1 += line_size;
1137 pix2 += line_size;
1138 }
1139 return s;
1140 }
1141
1142 int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1143 {
1144 int s, i;
1145
1146 s = 0;
1147 for(i=0;i<8;i++) {
1148 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1149 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1150 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1151 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1152 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1153 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1154 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1155 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1156 pix1 += line_size;
1157 pix2 += line_size;
1158 }
1159 return s;
1160 }
1161
1162 int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1163 {
1164 int s, i;
1165 UINT8 *pix3 = pix2 + line_size;
1166
1167 s = 0;
1168 for(i=0;i<8;i++) {
1169 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1170 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1171 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1172 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1173 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1174 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1175 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1176 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1177 pix1 += line_size;
1178 pix2 += line_size;
1179 pix3 += line_size;
1180 }
1181 return s;
1182 }
1183
1184 int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1185 {
1186 int s, i;
1187 UINT8 *pix3 = pix2 + line_size;
1188
1189 s = 0;
1190 for(i=0;i<8;i++) {
1191 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1192 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1193 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1194 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1195 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1196 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1197 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1198 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1199 pix1 += line_size;
1200 pix2 += line_size;
1201 pix3 += line_size;
1202 }
1203 return s;
1204 }
1205
/* permute the block so that it corresponds to the MMX idct
   order */
1208 #ifdef SIMPLE_IDCT
1209 /* general permutation, but perhaps slightly slower */
1210 void block_permute(INT16 *block)
1211 {
1212 int i;
1213 INT16 temp[64];
1214
1215 for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1216
1217 for(i=0; i<64; i++) block[i] = temp[i];
1218 }
1219 #else
1220
1221 void block_permute(INT16 *block)
1222 {
1223 int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1224 int i;
1225
1226 for(i=0;i<8;i++) {
1227 tmp1 = block[1];
1228 tmp2 = block[2];
1229 tmp3 = block[3];
1230 tmp4 = block[4];
1231 tmp5 = block[5];
1232 tmp6 = block[6];
1233 block[1] = tmp2;
1234 block[2] = tmp4;
1235 block[3] = tmp6;
1236 block[4] = tmp1;
1237 block[5] = tmp3;
1238 block[6] = tmp5;
1239 block += 8;
1240 }
1241 }
1242 #endif
1243
/* zero 6 consecutive blocks of 64 DCTELEM coefficients each
   (presumably the 6 blocks of one macroblock — confirm with callers) */
void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
1248
/* XXX: these functions should be removed as soon as all IDCTs are
   converted */
/* generic idct_put: run the selected IDCT on the block in place,
   then clamp and store the result at dest (stride line_size) */
void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
{
    ff_idct (block);
    put_pixels_clamped(block, dest, line_size);
}
1256
/* generic idct_add: run the selected IDCT on the block in place,
   then add the clamped result to the pixels at dest (stride line_size) */
void gen_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
{
    ff_idct (block);
    add_pixels_clamped(block, dest, line_size);
}
1262
/* One-time initialization of all DSP function pointers and lookup
   tables. C implementations are installed first; architecture
   specific init functions may then override them. */
void dsputil_init(void)
{
    int i, j;
    int use_permuted_idct;

    /* cropTbl[MAX_NEG_CROP + x] clamps x to the 0..255 range */
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        cropTbl[i] = 0;
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    /* squareTbl[256 + d] = d*d for d in -256..255 (used by get_psnr) */
    for(i=0;i<512;i++) {
        squareTbl[i] = (i - 256) * (i - 256);
    }

#ifdef SIMPLE_IDCT
    /* NULL marks "no arch specific IDCT installed yet"; resolved below */
    ff_idct = NULL;
#else
    ff_idct = j_rev_dct;
#endif
    /* default C implementations */
    get_pixels = get_pixels_c;
    diff_pixels = diff_pixels_c;
    put_pixels_clamped = put_pixels_clamped_c;
    add_pixels_clamped = add_pixels_clamped_c;
    gmc1= gmc1_c;
    clear_blocks= clear_blocks_c;

    pix_abs16x16 = pix_abs16x16_c;
    pix_abs16x16_x2 = pix_abs16x16_x2_c;
    pix_abs16x16_y2 = pix_abs16x16_y2_c;
    pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
    pix_abs8x8 = pix_abs8x8_c;
    pix_abs8x8_x2 = pix_abs8x8_x2_c;
    pix_abs8x8_y2 = pix_abs8x8_y2_c;
    pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
    av_fdct = fdct_ifast;

    use_permuted_idct = 1;

    /* architecture specific overrides; mlib and alpha use a non
       permuted (natural order) IDCT */
#ifdef HAVE_MMX
    dsputil_init_mmx();
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l();
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib();
    use_permuted_idct = 0;
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha();
    use_permuted_idct = 0;
#endif

#ifdef SIMPLE_IDCT
    if (ff_idct == NULL) {
        /* no arch specific IDCT was installed: use the C simple idct,
           which has its own put/add entry points */
        ff_idct_put = simple_idct_put;
        ff_idct_add = simple_idct_add;
        use_permuted_idct=0;
    } else {
        /* wrap the in-place IDCT with the generic put/add helpers */
        ff_idct_put = gen_idct_put;
        ff_idct_add = gen_idct_add;
    }
#endif

    /* build the coefficient permutation table for the selected IDCT */
    if(use_permuted_idct)
#ifdef SIMPLE_IDCT
        for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
#else
        for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
#endif
    else
        for(i=0; i<64; i++) permutation[i]=i;

    /* inverse zigzag (1-based) and a copy of the unpermuted scan */
    for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
    for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];

    if (use_permuted_idct) {
        /* permute the scan tables and default quant matrices for IDCT */
        for(i=0;i<64;i++) {
            j = zigzag_direct[i];
            zigzag_direct[i] = block_permute_op(j);
            j = ff_alternate_horizontal_scan[i];
            ff_alternate_horizontal_scan[i] = block_permute_op(j);
            j = ff_alternate_vertical_scan[i];
            ff_alternate_vertical_scan[i] = block_permute_op(j);
        }
        block_permute(default_intra_matrix);
        block_permute(default_non_intra_matrix);
        block_permute(ff_mpeg4_default_intra_matrix);
        block_permute(ff_mpeg4_default_non_intra_matrix);
    }

    build_zigzag_end();
}
1358
/* remove any non bit exact operation (testing purpose): only the MMX
   code currently has such operations to disable */
void avcodec_set_bit_exact(void)
{
#ifdef HAVE_MMX
    dsputil_set_bit_exact_mmx();
#endif
}
1366
1367 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1368 int orig_linesize[3], int coded_linesize,
1369 AVCodecContext *avctx)
1370 {
1371 int quad, diff, x, y;
1372 UINT8 *orig, *coded;
1373 UINT32 *sq = squareTbl + 256;
1374
1375 quad = 0;
1376 diff = 0;
1377
1378 /* Luminance */
1379 orig = orig_image[0];
1380 coded = coded_image[0];
1381
1382 for (y=0;y<avctx->height;y++) {
1383 for (x=0;x<avctx->width;x++) {
1384 diff = *(orig + x) - *(coded + x);
1385 quad += sq[diff];
1386 }
1387 orig += orig_linesize[0];
1388 coded += coded_linesize;
1389 }
1390
1391 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1392
1393 if (avctx->psnr_y) {
1394 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1395 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1396 } else
1397 avctx->psnr_y = 99.99;
1398 }
1399