consistent include usage
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
1/*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Gerard Lantau.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
7ff037e9 18 *
59fe111e 19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
de6d9b64 20 */
de6d9b64
FB
21#include "avcodec.h"
22#include "dsputil.h"
d962f6fd 23#include "simple_idct.h"
de6d9b64 24
4af7bcc1 25void (*ff_idct)(DCTELEM *block);
de6d9b64 26void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
9dbcbd92 27void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
de6d9b64
FB
28void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
29void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
44eb4951 30void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
649c00c9 31void (*clear_blocks)(DCTELEM *blocks);
de6d9b64
FB
32
33op_pixels_abs_func pix_abs16x16;
34op_pixels_abs_func pix_abs16x16_x2;
35op_pixels_abs_func pix_abs16x16_y2;
36op_pixels_abs_func pix_abs16x16_xy2;
37
ba6802de
MN
38op_pixels_abs_func pix_abs8x8;
39op_pixels_abs_func pix_abs8x8_x2;
40op_pixels_abs_func pix_abs8x8_y2;
41op_pixels_abs_func pix_abs8x8_xy2;
42
0cfa9713 43UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
de6d9b64
FB
44UINT32 squareTbl[512];
45
e0eac44e
FB
46extern UINT16 default_intra_matrix[64];
47extern UINT16 default_non_intra_matrix[64];
3bf43d42
MN
48extern UINT16 ff_mpeg4_default_intra_matrix[64];
49extern UINT16 ff_mpeg4_default_non_intra_matrix[64];
e0eac44e
FB
50
51UINT8 zigzag_direct[64] = {
52 0, 1, 8, 16, 9, 2, 3, 10,
53 17, 24, 32, 25, 18, 11, 4, 5,
54 12, 19, 26, 33, 40, 48, 41, 34,
55 27, 20, 13, 6, 7, 14, 21, 28,
56 35, 42, 49, 56, 57, 50, 43, 36,
57 29, 22, 15, 23, 30, 37, 44, 51,
58 58, 59, 52, 45, 38, 31, 39, 46,
59 53, 60, 61, 54, 47, 55, 62, 63
60};
61
2f349de2
MN
62/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
63UINT16 __align8 inv_zigzag_direct16[64];
64
65/* not permutated zigzag_direct for MMX quantizer */
66UINT8 zigzag_direct_noperm[64];
67
e0eac44e
FB
68UINT8 ff_alternate_horizontal_scan[64] = {
69 0, 1, 2, 3, 8, 9, 16, 17,
70 10, 11, 4, 5, 6, 7, 15, 14,
71 13, 12, 19, 18, 24, 25, 32, 33,
72 26, 27, 20, 21, 22, 23, 28, 29,
73 30, 31, 34, 35, 40, 41, 48, 49,
74 42, 43, 36, 37, 38, 39, 44, 45,
75 46, 47, 50, 51, 56, 57, 58, 59,
76 52, 53, 54, 55, 60, 61, 62, 63,
77};
78
79UINT8 ff_alternate_vertical_scan[64] = {
80 0, 8, 16, 24, 1, 9, 2, 10,
81 17, 25, 32, 40, 48, 56, 57, 49,
82 41, 33, 26, 18, 3, 11, 4, 12,
83 19, 27, 34, 42, 50, 58, 35, 43,
84 51, 59, 20, 28, 5, 13, 6, 14,
85 21, 29, 36, 44, 52, 60, 37, 45,
86 53, 61, 22, 30, 7, 15, 23, 31,
87 38, 46, 54, 62, 39, 47, 55, 63,
88};
89
e4986da9
J
90#ifdef SIMPLE_IDCT
91
0a8d8945 92/* Input permutation for the simple_idct_mmx */
5a240838 93static UINT8 simple_mmx_permutation[64]={
0a8d8945
MN
94 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
95 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
96 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
97 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
98 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
99 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
100 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
101 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
5a240838 102};
e4986da9 103#endif
5a240838 104
2f349de2
MN
105/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
106UINT32 inverse[256]={
107 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
108 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
109 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
110 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
111 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
112 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
113 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
114 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
115 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
116 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
117 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
118 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
119 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
120 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
121 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
122 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
123 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
124 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
125 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
126 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
127 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
128 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
129 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
130 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
131 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
132 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
133 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
134 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
135 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
136 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
137 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
138 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
139};
140
badaf88e
MN
141/* used to skip zeros at the end */
142UINT8 zigzag_end[64];
143
5a240838
MN
144UINT8 permutation[64];
145//UINT8 invPermutation[64];
146
badaf88e
MN
147static void build_zigzag_end()
148{
149 int lastIndex;
150 int lastIndexAfterPerm=0;
151 for(lastIndex=0; lastIndex<64; lastIndex++)
152 {
153 if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
154 lastIndexAfterPerm= zigzag_direct[lastIndex];
155 zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
156 }
157}
158
de6d9b64
FB
159void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size)
160{
161 DCTELEM *p;
162 const UINT8 *pix;
163 int i;
164
165 /* read the pixels */
166 p = block;
167 pix = pixels;
168 for(i=0;i<8;i++) {
169 p[0] = pix[0];
170 p[1] = pix[1];
171 p[2] = pix[2];
172 p[3] = pix[3];
173 p[4] = pix[4];
174 p[5] = pix[5];
175 p[6] = pix[6];
176 p[7] = pix[7];
177 pix += line_size;
178 p += 8;
179 }
180}
181
9dbcbd92
MN
182void diff_pixels_c(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride){
183 DCTELEM *p;
184 int i;
185
186 /* read the pixels */
187 p = block;
188 for(i=0;i<8;i++) {
189 p[0] = s1[0] - s2[0];
190 p[1] = s1[1] - s2[1];
191 p[2] = s1[2] - s2[2];
192 p[3] = s1[3] - s2[3];
193 p[4] = s1[4] - s2[4];
194 p[5] = s1[5] - s2[5];
195 p[6] = s1[6] - s2[6];
196 p[7] = s1[7] - s2[7];
197 s1 += stride;
198 s2 += stride;
199 p += 8;
200 }
201}
202
203
de6d9b64
FB
204void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
205{
206 const DCTELEM *p;
207 UINT8 *pix;
208 int i;
209 UINT8 *cm = cropTbl + MAX_NEG_CROP;
210
211 /* read the pixels */
212 p = block;
213 pix = pixels;
214 for(i=0;i<8;i++) {
215 pix[0] = cm[p[0]];
216 pix[1] = cm[p[1]];
217 pix[2] = cm[p[2]];
218 pix[3] = cm[p[3]];
219 pix[4] = cm[p[4]];
220 pix[5] = cm[p[5]];
221 pix[6] = cm[p[6]];
222 pix[7] = cm[p[7]];
223 pix += line_size;
224 p += 8;
225 }
226}
227
228void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
229{
230 const DCTELEM *p;
231 UINT8 *pix;
232 int i;
233 UINT8 *cm = cropTbl + MAX_NEG_CROP;
234
235 /* read the pixels */
236 p = block;
237 pix = pixels;
238 for(i=0;i<8;i++) {
239 pix[0] = cm[pix[0] + p[0]];
240 pix[1] = cm[pix[1] + p[1]];
241 pix[2] = cm[pix[2] + p[2]];
242 pix[3] = cm[pix[3] + p[3]];
243 pix[4] = cm[pix[4] + p[4]];
244 pix[5] = cm[pix[5] + p[5]];
245 pix[6] = cm[pix[6] + p[6]];
246 pix[7] = cm[pix[7] + p[7]];
247 pix += line_size;
248 p += 8;
249 }
250}
251
10fc8424
MN
252#ifdef __GNUC__
253
254struct unaligned_64 { uint64_t l; } __attribute__((packed));
255struct unaligned_32 { uint32_t l; } __attribute__((packed));
256
257#define LD32(a) (((const struct unaligned_32 *) (a))->l)
258#define LD64(a) (((const struct unaligned_64 *) (a))->l)
259
260#else /* __GNUC__ */
59fe111e
MN
261
262#define LD32(a) (*((uint32_t*)(a)))
263#define LD64(a) (*((uint64_t*)(a)))
264
10fc8424
MN
265#endif /* !__GNUC__ */
266
59fe111e
MN
267#if 0
268
269#define PIXOP2(OPNAME, OP) \
270void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
271{\
272 int i;\
273 for(i=0; i<h; i++){\
274 OP(*((uint64_t*)block), LD64(pixels));\
275 pixels+=line_size;\
276 block +=line_size;\
277 }\
278}\
279\
280void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
281{\
282 int i;\
283 for(i=0; i<h; i++){\
284 const uint64_t a= LD64(pixels );\
285 const uint64_t b= LD64(pixels+1);\
286 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
287 pixels+=line_size;\
288 block +=line_size;\
289 }\
290}\
291\
292void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
293{\
294 int i;\
295 for(i=0; i<h; i++){\
296 const uint64_t a= LD64(pixels );\
297 const uint64_t b= LD64(pixels+1);\
298 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
299 pixels+=line_size;\
300 block +=line_size;\
301 }\
302}\
303\
304void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
305{\
306 int i;\
307 for(i=0; i<h; i++){\
308 const uint64_t a= LD64(pixels );\
309 const uint64_t b= LD64(pixels+line_size);\
310 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
311 pixels+=line_size;\
312 block +=line_size;\
313 }\
314}\
315\
316void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
317{\
318 int i;\
319 for(i=0; i<h; i++){\
320 const uint64_t a= LD64(pixels );\
321 const uint64_t b= LD64(pixels+line_size);\
322 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
323 pixels+=line_size;\
324 block +=line_size;\
325 }\
326}\
327\
328void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
329{\
330 int i;\
331 const uint64_t a= LD64(pixels );\
332 const uint64_t b= LD64(pixels+1);\
333 uint64_t l0= (a&0x0303030303030303ULL)\
334 + (b&0x0303030303030303ULL)\
335 + 0x0202020202020202ULL;\
336 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
337 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
338 uint64_t l1,h1;\
339\
340 pixels+=line_size;\
341 for(i=0; i<h; i+=2){\
342 uint64_t a= LD64(pixels );\
343 uint64_t b= LD64(pixels+1);\
344 l1= (a&0x0303030303030303ULL)\
345 + (b&0x0303030303030303ULL);\
346 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
347 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
348 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
349 pixels+=line_size;\
350 block +=line_size;\
351 a= LD64(pixels );\
352 b= LD64(pixels+1);\
353 l0= (a&0x0303030303030303ULL)\
354 + (b&0x0303030303030303ULL)\
355 + 0x0202020202020202ULL;\
356 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
357 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
358 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
359 pixels+=line_size;\
360 block +=line_size;\
361 }\
362}\
363\
364void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
365{\
366 int i;\
367 const uint64_t a= LD64(pixels );\
368 const uint64_t b= LD64(pixels+1);\
369 uint64_t l0= (a&0x0303030303030303ULL)\
370 + (b&0x0303030303030303ULL)\
371 + 0x0101010101010101ULL;\
372 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
373 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
374 uint64_t l1,h1;\
375\
376 pixels+=line_size;\
377 for(i=0; i<h; i+=2){\
378 uint64_t a= LD64(pixels );\
379 uint64_t b= LD64(pixels+1);\
380 l1= (a&0x0303030303030303ULL)\
381 + (b&0x0303030303030303ULL);\
382 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
383 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
384 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
385 pixels+=line_size;\
386 block +=line_size;\
387 a= LD64(pixels );\
388 b= LD64(pixels+1);\
389 l0= (a&0x0303030303030303ULL)\
390 + (b&0x0303030303030303ULL)\
391 + 0x0101010101010101ULL;\
392 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
393 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
394 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
395 pixels+=line_size;\
396 block +=line_size;\
397 }\
398}\
399\
400void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
401 OPNAME ## _pixels,\
402 OPNAME ## _pixels_x2,\
403 OPNAME ## _pixels_y2,\
404 OPNAME ## _pixels_xy2,\
405};\
406\
407void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
408 OPNAME ## _pixels,\
409 OPNAME ## _no_rnd_pixels_x2,\
410 OPNAME ## _no_rnd_pixels_y2,\
411 OPNAME ## _no_rnd_pixels_xy2,\
412};
413
414#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
415#else // 32 bit variant (the 64 bit variant above is disabled)
416
/*
 * PIXOP2(OPNAME, OP) - 32 bit variant.
 *
 * Expands to seven functions working on rows of 8 pixels (two 32 bit words
 * per row) over h rows, plus two 4-entry dispatch tables.  OP(dst, value) is
 * the store operation applied to each destination word.  Source words are
 * fetched with LD32(); the destination is accessed directly through a
 * uint32_t pointer.  Both pointers advance by line_size per row.
 *
 *   OPNAME_pixels             plain copy of the source words
 *   OPNAME_pixels_x2          per-byte average with the pixel to the right,
 *                             rounding up:   (a|b) - (((a^b)&0xFE..)>>1)
 *   OPNAME_no_rnd_pixels_x2   same, rounding down: (a&b) + (((a^b)&0xFE..)>>1)
 *   OPNAME_pixels_y2          / OPNAME_no_rnd_pixels_y2
 *                             as x2 but with the pixel one line below
 *   OPNAME_pixels_xy2         per-byte (a+b+c+d+2)>>2 of the 2x2 neighbourhood
 *   OPNAME_no_rnd_pixels_xy2  per-byte (a+b+c+d+1)>>2
 *
 * The xy2 functions split every byte into its low 2 bits and its high 6 bits
 * (pre-shifted by 2) so four bytes can be summed without carries crossing
 * byte lanes.  They emit two rows per loop iteration and rewind the pointers
 * by h (block) / h+1 (pixels) lines between the two 4-pixel columns, so h is
 * expected to be even.
 */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row, x;\
    for(row=0; row<h; row++){\
        for(x=0; x<8; x+=4){\
            OP(*((uint32_t*)(block+x)), LD32(pixels+x));\
        }\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row, x;\
    for(row=0; row<h; row++){\
        for(x=0; x<8; x+=4){\
            const uint32_t cur= LD32(pixels+x  );\
            const uint32_t nxt= LD32(pixels+x+1);\
            OP(*((uint32_t*)(block+x)), (cur&nxt) + (((cur^nxt)&0xFEFEFEFEUL)>>1));\
        }\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row, x;\
    for(row=0; row<h; row++){\
        for(x=0; x<8; x+=4){\
            const uint32_t cur= LD32(pixels+x  );\
            const uint32_t nxt= LD32(pixels+x+1);\
            OP(*((uint32_t*)(block+x)), (cur|nxt) - (((cur^nxt)&0xFEFEFEFEUL)>>1));\
        }\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row, x;\
    for(row=0; row<h; row++){\
        for(x=0; x<8; x+=4){\
            const uint32_t upper= LD32(pixels+x          );\
            const uint32_t lower= LD32(pixels+x+line_size);\
            OP(*((uint32_t*)(block+x)), (upper&lower) + (((upper^lower)&0xFEFEFEFEUL)>>1));\
        }\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row, x;\
    for(row=0; row<h; row++){\
        for(x=0; x<8; x+=4){\
            const uint32_t upper= LD32(pixels+x          );\
            const uint32_t lower= LD32(pixels+x+line_size);\
            OP(*((uint32_t*)(block+x)), (upper|lower) - (((upper^lower)&0xFEFEFEFEUL)>>1));\
        }\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int column;\
    for(column=0; column<2; column++){\
        int row;\
        uint32_t a= LD32(pixels  );\
        uint32_t b= LD32(pixels+1);\
        /* even source lines carry the rounding constant, odd ones do not */\
        uint32_t lo_even= (a&0x03030303UL)\
                        + (b&0x03030303UL)\
                        + 0x02020202UL;\
        uint32_t hi_even= ((a&0xFCFCFCFCUL)>>2)\
                        + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t lo_odd, hi_odd;\
\
        pixels+=line_size;\
        for(row=0; row<h; row+=2){\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            lo_odd= (a&0x03030303UL)\
                  + (b&0x03030303UL);\
            hi_odd= ((a&0xFCFCFCFCUL)>>2)\
                  + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), hi_even+hi_odd+(((lo_even+lo_odd)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            lo_even= (a&0x03030303UL)\
                   + (b&0x03030303UL)\
                   + 0x02020202UL;\
            hi_even= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), hi_even+hi_odd+(((lo_even+lo_odd)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* back to the top, 4 pixels to the right */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int column;\
    for(column=0; column<2; column++){\
        int row;\
        uint32_t a= LD32(pixels  );\
        uint32_t b= LD32(pixels+1);\
        /* even source lines carry the rounding constant, odd ones do not */\
        uint32_t lo_even= (a&0x03030303UL)\
                        + (b&0x03030303UL)\
                        + 0x01010101UL;\
        uint32_t hi_even= ((a&0xFCFCFCFCUL)>>2)\
                        + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t lo_odd, hi_odd;\
\
        pixels+=line_size;\
        for(row=0; row<h; row+=2){\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            lo_odd= (a&0x03030303UL)\
                  + (b&0x03030303UL);\
            hi_odd= ((a&0xFCFCFCFCUL)>>2)\
                  + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), hi_even+hi_odd+(((lo_even+lo_odd)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            lo_even= (a&0x03030303UL)\
                   + (b&0x03030303UL)\
                   + 0x01010101UL;\
            hi_even= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), hi_even+hi_odd+(((lo_even+lo_odd)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* back to the top, 4 pixels to the right */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};
592#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
593#endif
594
595#define op_put(a, b) a = b
596
597PIXOP2(avg, op_avg)
598PIXOP2(put, op_put)
599#undef op_avg
600#undef op_put
601
57060b1e 602#if 0
59fe111e 603/* FIXME this stuff could be removed as it's not really used anymore */
de6d9b64
FB
604#define PIXOP(BTYPE, OPNAME, OP, INCR) \
605 \
606static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
607{ \
608 BTYPE *p; \
609 const UINT8 *pix; \
610 \
611 p = block; \
612 pix = pixels; \
613 do { \
614 OP(p[0], pix[0]); \
615 OP(p[1], pix[1]); \
616 OP(p[2], pix[2]); \
617 OP(p[3], pix[3]); \
618 OP(p[4], pix[4]); \
619 OP(p[5], pix[5]); \
620 OP(p[6], pix[6]); \
621 OP(p[7], pix[7]); \
622 pix += line_size; \
623 p += INCR; \
624 } while (--h);; \
625} \
626 \
627static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
628{ \
629 BTYPE *p; \
630 const UINT8 *pix; \
631 \
632 p = block; \
633 pix = pixels; \
634 do { \
635 OP(p[0], avg2(pix[0], pix[1])); \
636 OP(p[1], avg2(pix[1], pix[2])); \
637 OP(p[2], avg2(pix[2], pix[3])); \
638 OP(p[3], avg2(pix[3], pix[4])); \
639 OP(p[4], avg2(pix[4], pix[5])); \
640 OP(p[5], avg2(pix[5], pix[6])); \
641 OP(p[6], avg2(pix[6], pix[7])); \
642 OP(p[7], avg2(pix[7], pix[8])); \
643 pix += line_size; \
644 p += INCR; \
645 } while (--h); \
646} \
647 \
648static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
649{ \
650 BTYPE *p; \
651 const UINT8 *pix; \
652 const UINT8 *pix1; \
653 \
654 p = block; \
655 pix = pixels; \
656 pix1 = pixels + line_size; \
657 do { \
658 OP(p[0], avg2(pix[0], pix1[0])); \
659 OP(p[1], avg2(pix[1], pix1[1])); \
660 OP(p[2], avg2(pix[2], pix1[2])); \
661 OP(p[3], avg2(pix[3], pix1[3])); \
662 OP(p[4], avg2(pix[4], pix1[4])); \
663 OP(p[5], avg2(pix[5], pix1[5])); \
664 OP(p[6], avg2(pix[6], pix1[6])); \
665 OP(p[7], avg2(pix[7], pix1[7])); \
666 pix += line_size; \
667 pix1 += line_size; \
668 p += INCR; \
669 } while(--h); \
670} \
671 \
672static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
673{ \
674 BTYPE *p; \
675 const UINT8 *pix; \
676 const UINT8 *pix1; \
677 \
678 p = block; \
679 pix = pixels; \
680 pix1 = pixels + line_size; \
681 do { \
682 OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1])); \
683 OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2])); \
684 OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3])); \
685 OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4])); \
686 OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5])); \
687 OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6])); \
688 OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7])); \
689 OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8])); \
690 pix += line_size; \
691 pix1 += line_size; \
692 p += INCR; \
693 } while(--h); \
694} \
695 \
696void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
697 OPNAME ## _pixels, \
698 OPNAME ## _pixels_x2, \
699 OPNAME ## _pixels_y2, \
700 OPNAME ## _pixels_xy2, \
701};
702
de6d9b64
FB
703/* rounding primitives */
704#define avg2(a,b) ((a+b+1)>>1)
705#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
706
de6d9b64
FB
707#define op_avg(a, b) a = avg2(a, b)
708#define op_sub(a, b) a -= b
709
de6d9b64
FB
710PIXOP(DCTELEM, sub, op_sub, 8)
711
712/* not rounding primitives */
713#undef avg2
714#undef avg4
715#define avg2(a,b) ((a+b)>>1)
716#define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
717
de6d9b64
FB
718/* motion estimation */
719
720#undef avg2
721#undef avg4
57060b1e
FB
722#endif
723
de6d9b64
FB
724#define avg2(a,b) ((a+b+1)>>1)
725#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
726
44eb4951
MN
727static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
728{
729 const int A=(16-x16)*(16-y16);
730 const int B=( x16)*(16-y16);
731 const int C=(16-x16)*( y16);
732 const int D=( x16)*( y16);
733 int i;
734 rounder= 128 - rounder;
735
736 for(i=0; i<h; i++)
737 {
738 dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
739 dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
740 dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
741 dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
742 dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
743 dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
744 dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
745 dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
746 dst+= srcStride;
747 src+= srcStride;
748 }
749}
750
751static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
752{
753 UINT8 *cm = cropTbl + MAX_NEG_CROP;
754 int i;
755 for(i=0; i<h; i++)
756 {
ba6802de
MN
757 dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
758 dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
759 dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
760 dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
761 dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
762 dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
763 dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
764 dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
44eb4951
MN
765 dst+=dstStride;
766 src+=srcStride;
767 }
768}
769
770static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
771{
772 UINT8 *cm = cropTbl + MAX_NEG_CROP;
773 int i;
774 for(i=0; i<w; i++)
775 {
776 const int src0= src[0*srcStride];
777 const int src1= src[1*srcStride];
778 const int src2= src[2*srcStride];
779 const int src3= src[3*srcStride];
780 const int src4= src[4*srcStride];
781 const int src5= src[5*srcStride];
782 const int src6= src[6*srcStride];
783 const int src7= src[7*srcStride];
784 const int src8= src[8*srcStride];
ba6802de
MN
785 dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
786 dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
787 dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
788 dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
789 dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
790 dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
791 dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
792 dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
44eb4951
MN
793 dst++;
794 src++;
795 }
796}
797
798static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
799{
800 int i;
801 for(i=0; i<8; i++)
802 {
803 dst[0]= src[0];
804 dst[1]= src[1];
805 dst[2]= src[2];
806 dst[3]= src[3];
807 dst[4]= src[4];
808 dst[5]= src[5];
809 dst[6]= src[6];
810 dst[7]= src[7];
811 dst+=dstStride;
812 src+=srcStride;
813 }
814}
815
816static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
817{
818 int i;
819 for(i=0; i<8; i++)
820 {
821 dst[0]= (src1[0] + src2[0] + r)>>1;
822 dst[1]= (src1[1] + src2[1] + r)>>1;
823 dst[2]= (src1[2] + src2[2] + r)>>1;
824 dst[3]= (src1[3] + src2[3] + r)>>1;
825 dst[4]= (src1[4] + src2[4] + r)>>1;
826 dst[5]= (src1[5] + src2[5] + r)>>1;
827 dst[6]= (src1[6] + src2[6] + r)>>1;
828 dst[7]= (src1[7] + src2[7] + r)>>1;
829 dst+=dstStride;
830 src1+=srcStride;
831 src2+=8;
832 }
833}
834
835static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
836{
837 int i;
838 for(i=0; i<8; i++)
839 {
840 dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
841 dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
842 dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
843 dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
844 dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
845 dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
846 dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
847 dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
848 dst+=dstStride;
849 src1+=srcStride;
850 src2+=8;
7ff037e9 851 src3+=8;
44eb4951
MN
852 src4+=8;
853 }
854}
855
856#define QPEL_MC(r, name) \
857static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
858{\
859 put_block(dst, src, dstStride, srcStride);\
860}\
861\
862static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
863{\
864 UINT8 half[64];\
ba6802de 865 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
866 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
867}\
868\
869static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
870{\
ba6802de 871 qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
44eb4951
MN
872}\
873\
874static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
875{\
876 UINT8 half[64];\
ba6802de 877 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
878 avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
879}\
880\
881static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
882{\
883 UINT8 half[64];\
ba6802de 884 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
885 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
886}\
887\
888static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
889{\
ba6802de 890 qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
44eb4951
MN
891}\
892\
893static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
894{\
895 UINT8 half[64];\
ba6802de 896 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
897 avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
898}\
899static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
900{\
901 UINT8 halfH[72];\
7ff037e9 902 UINT8 halfV[64];\
44eb4951 903 UINT8 halfHV[64];\
ba6802de
MN
904 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
905 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
906 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
907 avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
908}\
909static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
910{\
911 UINT8 halfH[72];\
7ff037e9 912 UINT8 halfV[64];\
44eb4951 913 UINT8 halfHV[64];\
ba6802de
MN
914 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
915 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
916 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
917 avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
918}\
919static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
920{\
921 UINT8 halfH[72];\
7ff037e9 922 UINT8 halfV[64];\
44eb4951 923 UINT8 halfHV[64];\
ba6802de
MN
924 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
925 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
926 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 927 avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
44eb4951
MN
928}\
929static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
930{\
931 UINT8 halfH[72];\
7ff037e9 932 UINT8 halfV[64];\
44eb4951 933 UINT8 halfHV[64];\
ba6802de
MN
934 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
935 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
936 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 937 avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
44eb4951
MN
938}\
939static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
940{\
941 UINT8 halfH[72];\
942 UINT8 halfHV[64];\
ba6802de
MN
943 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
944 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
945 avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
946}\
947static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
948{\
949 UINT8 halfH[72];\
950 UINT8 halfHV[64];\
ba6802de
MN
951 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
952 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
953 avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
954}\
955static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
956{\
957 UINT8 halfH[72];\
7ff037e9 958 UINT8 halfV[64];\
44eb4951 959 UINT8 halfHV[64];\
ba6802de
MN
960 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
961 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
962 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 963 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
44eb4951
MN
964}\
965static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
966{\
967 UINT8 halfH[72];\
7ff037e9 968 UINT8 halfV[64];\
44eb4951 969 UINT8 halfHV[64];\
ba6802de
MN
970 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
971 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
972 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 973 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
44eb4951
MN
974}\
975static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
976{\
977 UINT8 halfH[72];\
ba6802de
MN
978 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
979 qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
44eb4951
MN
980}\
981qpel_mc_func qpel_mc ## name ## _tab[16]={ \
982 qpel_mc00_c ## name, \
983 qpel_mc10_c ## name, \
984 qpel_mc20_c ## name, \
985 qpel_mc30_c ## name, \
986 qpel_mc01_c ## name, \
987 qpel_mc11_c ## name, \
988 qpel_mc21_c ## name, \
989 qpel_mc31_c ## name, \
990 qpel_mc02_c ## name, \
991 qpel_mc12_c ## name, \
992 qpel_mc22_c ## name, \
993 qpel_mc32_c ## name, \
994 qpel_mc03_c ## name, \
995 qpel_mc13_c ## name, \
996 qpel_mc23_c ## name, \
997 qpel_mc33_c ## name, \
998};
999
/* Instantiate the quarter-pel motion compensation code twice, once with the
 * "_rnd" suffix and once with the "_no_rnd" suffix.
 * NOTE(review): the first argument is presumably the rounding selector "r"
 * used in the macro body, and the second the "name" suffix pasted onto the
 * qpel_mcXY_c functions and the 16-entry _tab table -- confirm against the
 * QPEL_MC definition. */
1000QPEL_MC(0, _rnd)
1001QPEL_MC(1, _no_rnd)
1002
ba6802de 1003int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1004{
1005 int s, i;
1006
1007 s = 0;
ba6802de 1008 for(i=0;i<16;i++) {
de6d9b64
FB
1009 s += abs(pix1[0] - pix2[0]);
1010 s += abs(pix1[1] - pix2[1]);
1011 s += abs(pix1[2] - pix2[2]);
1012 s += abs(pix1[3] - pix2[3]);
1013 s += abs(pix1[4] - pix2[4]);
1014 s += abs(pix1[5] - pix2[5]);
1015 s += abs(pix1[6] - pix2[6]);
1016 s += abs(pix1[7] - pix2[7]);
1017 s += abs(pix1[8] - pix2[8]);
1018 s += abs(pix1[9] - pix2[9]);
1019 s += abs(pix1[10] - pix2[10]);
1020 s += abs(pix1[11] - pix2[11]);
1021 s += abs(pix1[12] - pix2[12]);
1022 s += abs(pix1[13] - pix2[13]);
1023 s += abs(pix1[14] - pix2[14]);
1024 s += abs(pix1[15] - pix2[15]);
1025 pix1 += line_size;
1026 pix2 += line_size;
1027 }
1028 return s;
1029}
1030
ba6802de 1031int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1032{
1033 int s, i;
1034
1035 s = 0;
ba6802de 1036 for(i=0;i<16;i++) {
de6d9b64
FB
1037 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1038 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1039 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1040 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1041 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1042 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1043 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1044 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1045 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1046 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1047 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1048 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1049 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1050 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1051 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1052 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1053 pix1 += line_size;
1054 pix2 += line_size;
1055 }
1056 return s;
1057}
1058
ba6802de 1059int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1060{
1061 int s, i;
1062 UINT8 *pix3 = pix2 + line_size;
1063
1064 s = 0;
ba6802de 1065 for(i=0;i<16;i++) {
de6d9b64
FB
1066 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1067 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1068 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1069 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1070 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1071 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1072 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1073 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1074 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1075 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1076 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1077 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1078 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1079 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1080 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1081 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1082 pix1 += line_size;
1083 pix2 += line_size;
1084 pix3 += line_size;
1085 }
1086 return s;
1087}
1088
ba6802de 1089int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1090{
1091 int s, i;
1092 UINT8 *pix3 = pix2 + line_size;
1093
1094 s = 0;
ba6802de 1095 for(i=0;i<16;i++) {
de6d9b64
FB
1096 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1097 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1098 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1099 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1100 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1101 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1102 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1103 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1104 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1105 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1106 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1107 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1108 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1109 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1110 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1111 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1112 pix1 += line_size;
1113 pix2 += line_size;
1114 pix3 += line_size;
1115 }
1116 return s;
1117}
1118
ba6802de
MN
1119int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1120{
1121 int s, i;
1122
1123 s = 0;
1124 for(i=0;i<8;i++) {
1125 s += abs(pix1[0] - pix2[0]);
1126 s += abs(pix1[1] - pix2[1]);
1127 s += abs(pix1[2] - pix2[2]);
1128 s += abs(pix1[3] - pix2[3]);
1129 s += abs(pix1[4] - pix2[4]);
1130 s += abs(pix1[5] - pix2[5]);
1131 s += abs(pix1[6] - pix2[6]);
1132 s += abs(pix1[7] - pix2[7]);
1133 pix1 += line_size;
1134 pix2 += line_size;
1135 }
1136 return s;
1137}
1138
1139int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1140{
1141 int s, i;
1142
1143 s = 0;
1144 for(i=0;i<8;i++) {
1145 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1146 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1147 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1148 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1149 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1150 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1151 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1152 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1153 pix1 += line_size;
1154 pix2 += line_size;
1155 }
1156 return s;
1157}
1158
1159int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1160{
1161 int s, i;
1162 UINT8 *pix3 = pix2 + line_size;
1163
1164 s = 0;
1165 for(i=0;i<8;i++) {
1166 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1167 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1168 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1169 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1170 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1171 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1172 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1173 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1174 pix1 += line_size;
1175 pix2 += line_size;
1176 pix3 += line_size;
1177 }
1178 return s;
1179}
1180
1181int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1182{
1183 int s, i;
1184 UINT8 *pix3 = pix2 + line_size;
1185
1186 s = 0;
1187 for(i=0;i<8;i++) {
1188 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1189 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1190 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1191 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1192 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1193 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1194 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1195 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1196 pix1 += line_size;
1197 pix2 += line_size;
1198 pix3 += line_size;
1199 }
1200 return s;
1201}
1202
e0eac44e
FB
1203/* permute block according so that it corresponds to the MMX idct
1204 order */
d962f6fd 1205#ifdef SIMPLE_IDCT
5a240838 1206 /* general permutation, but perhaps slightly slower */
d962f6fd
A
1207void block_permute(INT16 *block)
1208{
1209 int i;
1210 INT16 temp[64];
1211
d962f6fd
A
1212 for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1213
1214 for(i=0; i<64; i++) block[i] = temp[i];
d962f6fd 1215}
d962f6fd
A
1216#else
1217
e0eac44e 1218void block_permute(INT16 *block)
de6d9b64 1219{
e0eac44e 1220 int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
de6d9b64
FB
1221 int i;
1222
e0eac44e
FB
1223 for(i=0;i<8;i++) {
1224 tmp1 = block[1];
1225 tmp2 = block[2];
1226 tmp3 = block[3];
1227 tmp4 = block[4];
1228 tmp5 = block[5];
1229 tmp6 = block[6];
1230 block[1] = tmp2;
1231 block[2] = tmp4;
1232 block[3] = tmp6;
1233 block[4] = tmp1;
1234 block[5] = tmp3;
1235 block[6] = tmp5;
1236 block += 8;
1237 }
1238}
d962f6fd 1239#endif
e0eac44e 1240
649c00c9
MN
1241void clear_blocks_c(DCTELEM *blocks)
1242{
1243 memset(blocks, 0, sizeof(DCTELEM)*6*64);
1244}
1245
e0eac44e
FB
1246void dsputil_init(void)
1247{
1248 int i, j;
c34270f5 1249 int use_permuted_idct;
e0eac44e 1250
de6d9b64
FB
1251 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1252 for(i=0;i<MAX_NEG_CROP;i++) {
1253 cropTbl[i] = 0;
1254 cropTbl[i + MAX_NEG_CROP + 256] = 255;
1255 }
1256
1257 for(i=0;i<512;i++) {
1258 squareTbl[i] = (i - 256) * (i - 256);
1259 }
1260
d962f6fd
A
1261#ifdef SIMPLE_IDCT
1262 ff_idct = simple_idct;
1263#else
4af7bcc1 1264 ff_idct = j_rev_dct;
d962f6fd 1265#endif
de6d9b64 1266 get_pixels = get_pixels_c;
9dbcbd92 1267 diff_pixels = diff_pixels_c;
de6d9b64
FB
1268 put_pixels_clamped = put_pixels_clamped_c;
1269 add_pixels_clamped = add_pixels_clamped_c;
44eb4951 1270 gmc1= gmc1_c;
649c00c9 1271 clear_blocks= clear_blocks_c;
de6d9b64 1272
ba6802de
MN
1273 pix_abs16x16 = pix_abs16x16_c;
1274 pix_abs16x16_x2 = pix_abs16x16_x2_c;
1275 pix_abs16x16_y2 = pix_abs16x16_y2_c;
de6d9b64 1276 pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
ba6802de
MN
1277 pix_abs8x8 = pix_abs8x8_c;
1278 pix_abs8x8_x2 = pix_abs8x8_x2_c;
1279 pix_abs8x8_y2 = pix_abs8x8_y2_c;
1280 pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
de6d9b64
FB
1281 av_fdct = jpeg_fdct_ifast;
1282
c34270f5 1283 use_permuted_idct = 1;
e0eac44e 1284
980fc7b8 1285#ifdef HAVE_MMX
de6d9b64
FB
1286 dsputil_init_mmx();
1287#endif
3d03c0a2
FB
1288#ifdef ARCH_ARMV4L
1289 dsputil_init_armv4l();
1290#endif
c34270f5
FB
1291#ifdef HAVE_MLIB
1292 dsputil_init_mlib();
1293 use_permuted_idct = 0;
1294#endif
1e98dffb
NK
1295#ifdef ARCH_ALPHA
1296 dsputil_init_alpha();
1297 use_permuted_idct = 0;
1298#endif
c34270f5 1299
d962f6fd
A
1300#ifdef SIMPLE_IDCT
1301 if(ff_idct == simple_idct) use_permuted_idct=0;
1302#endif
1303
5a240838
MN
1304 if(use_permuted_idct)
1305#ifdef SIMPLE_IDCT
1306 for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
1307#else
1308 for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
1309#endif
1310 else
1311 for(i=0; i<64; i++) permutation[i]=i;
1312
2f349de2
MN
1313 for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
1314 for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
1315
c34270f5
FB
1316 if (use_permuted_idct) {
1317 /* permute for IDCT */
1318 for(i=0;i<64;i++) {
1319 j = zigzag_direct[i];
1320 zigzag_direct[i] = block_permute_op(j);
1321 j = ff_alternate_horizontal_scan[i];
1322 ff_alternate_horizontal_scan[i] = block_permute_op(j);
1323 j = ff_alternate_vertical_scan[i];
1324 ff_alternate_vertical_scan[i] = block_permute_op(j);
1325 }
1326 block_permute(default_intra_matrix);
1327 block_permute(default_non_intra_matrix);
3bf43d42
MN
1328 block_permute(ff_mpeg4_default_intra_matrix);
1329 block_permute(ff_mpeg4_default_non_intra_matrix);
c34270f5 1330 }
badaf88e
MN
1331
1332 build_zigzag_end();
de6d9b64 1333}
43f1708f 1334
57060b1e
FB
/**
 * Remove any operation that is not bit exact (for testing purposes).
 *
 * Only the MMX code has something to switch off; when HAVE_MMX is not
 * defined this function does nothing.
 */
void avcodec_set_bit_exact(void)
{
#ifdef HAVE_MMX
    dsputil_set_bit_exact_mmx();
#endif
}
1342
43f1708f
J
1343void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1344 int orig_linesize[3], int coded_linesize,
1345 AVCodecContext *avctx)
1346{
1347 int quad, diff, x, y;
1348 UINT8 *orig, *coded;
1349 UINT32 *sq = squareTbl + 256;
1350
1351 quad = 0;
1352 diff = 0;
1353
1354 /* Luminance */
1355 orig = orig_image[0];
1356 coded = coded_image[0];
1357
1358 for (y=0;y<avctx->height;y++) {
1359 for (x=0;x<avctx->width;x++) {
1360 diff = *(orig + x) - *(coded + x);
1361 quad += sq[diff];
1362 }
1363 orig += orig_linesize[0];
1364 coded += coded_linesize;
1365 }
1366
1367 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1368
1369 if (avctx->psnr_y) {
1370 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1371 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1372 } else
1373 avctx->psnr_y = 99.99;
1374}
1375