c12d3d4fe5aa4deca4fae006c6a6740be9ad019b
[libav.git] / libavcodec / dsputil.c
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
20 */
21 #include "avcodec.h"
22 #include "dsputil.h"
23 /*
24 void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
25 void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
26 void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
27 void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
28 void (*ff_gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
29 void (*ff_gmc )(UINT8 *dst, UINT8 *src, int stride, int h, int ox, int oy,
30 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
31 void (*clear_blocks)(DCTELEM *blocks);
32 int (*pix_sum)(UINT8 * pix, int line_size);
33 int (*pix_norm1)(UINT8 * pix, int line_size);
34
35 op_pixels_abs_func pix_abs16x16;
36 op_pixels_abs_func pix_abs16x16_x2;
37 op_pixels_abs_func pix_abs16x16_y2;
38 op_pixels_abs_func pix_abs16x16_xy2;
39
40 op_pixels_abs_func pix_abs8x8;
41 op_pixels_abs_func pix_abs8x8_x2;
42 op_pixels_abs_func pix_abs8x8_y2;
43 op_pixels_abs_func pix_abs8x8_xy2;
44 */
/* Global flag, defined here with the initial value 0.  Nothing in this part of
 * the file reads it; presumably it is consulted by code that has to pick
 * bit-exact implementations -- TODO confirm against its users. */
int ff_bit_exact=0;

/* Lookup table that put_pixels_clamped_c()/add_pixels_clamped_c() address as
 * cropTbl[MAX_NEG_CROP + v]: 256 central entries with MAX_NEG_CROP entries of
 * headroom on either side.  Only the storage is defined here; the contents are
 * written elsewhere. */
UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
/* Lookup table that pix_norm1_c() addresses as squareTbl[256 + v] with v a
 * pixel value.  Only the storage is defined here; the contents are written
 * elsewhere (presumably the square of the signed offset -- verify). */
UINT32 squareTbl[512];
49
/* Zigzag scan order for an 8x8 block: 64 entries, each in the range 0..63.
 * Reading every value as row*8+column, entry k is the position of the k-th
 * coefficient along the zigzag path (0,0) (0,1) (1,0) (2,0) (1,1) (0,2) ... */
const UINT8 ff_zigzag_direct[64] = {
    0, 1, 8, 16, 9, 2, 3, 10,
    17, 24, 32, 25, 18, 11, 4, 5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13, 6, 7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
60
/* Inverse of the zigzag table (not permuted), with 1 added to every entry,
 * for use by the MMX quantizer.  Only the storage is defined here; the
 * contents are written elsewhere. */
UINT16 __align8 inv_zigzag_direct16[64];
63
/* Alternate scan order for an 8x8 block: 64 entries, each in the range 0..63.
 * Reading every value as row*8+column, the scan starts with the first four
 * coefficients of row 0 (0, 1, 2, 3) before moving to the next rows, i.e. it
 * favours horizontal runs compared with ff_zigzag_direct. */
const UINT8 ff_alternate_horizontal_scan[64] = {
    0, 1, 2, 3, 8, 9, 16, 17,
    10, 11, 4, 5, 6, 7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
74
/* Alternate scan order for an 8x8 block: 64 entries, each in the range 0..63.
 * Reading every value as row*8+column, the scan starts by walking down
 * column 0 (0, 8, 16, 24) before touching column 1, i.e. it favours vertical
 * runs compared with ff_zigzag_direct. */
const UINT8 ff_alternate_vertical_scan[64] = {
    0, 8, 16, 24, 1, 9, 2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18, 3, 11, 4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28, 5, 13, 6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30, 7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
85
/* Reciprocal table for division by multiplication:
 *     (a * inverse[b]) >> 32 == a / b   for all 0 <= a <= 65536 and 2 <= b <= 255
 * The product needs more than 32 bits (a can be 2^16 and inverse[2] is 2^31),
 * so it has to be formed in a 64-bit type before the shift.
 * Spot checks (b = 2, 3, 5, 7, 255) match ceil(2^32 / b).  Entry 0 is 0 and
 * entry 1 is 2^32-1 because 2^32 does not fit in 32 bits, which is why b == 1
 * is outside the guaranteed range. */
const UINT32 inverse[256]={
    0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
    536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
    268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
    178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
    134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
    107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
    89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
    76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
    67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
    59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
    53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
    48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
    44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
    41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
    38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
    35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
    33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
    31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
    29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
    28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
    26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
    25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
    24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
    23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
    22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
    21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
    20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
    19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
    19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
    18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
    17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
    17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
};
121
/**
 * Add up every pixel of a 16x16 block.
 *
 * @param pix       first (top-left) pixel of the block
 * @param line_size distance in bytes between the starts of two consecutive rows
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(UINT8 * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        const UINT8 *line = pix + row * line_size;

        for (col = 0; col < 16; col++)
            total += line[col];
    }
    return total;
}
143
144 static int pix_norm1_c(UINT8 * pix, int line_size)
145 {
146 int s, i, j;
147 UINT32 *sq = squareTbl + 256;
148
149 s = 0;
150 for (i = 0; i < 16; i++) {
151 for (j = 0; j < 16; j += 8) {
152 s += sq[pix[0]];
153 s += sq[pix[1]];
154 s += sq[pix[2]];
155 s += sq[pix[3]];
156 s += sq[pix[4]];
157 s += sq[pix[5]];
158 s += sq[pix[6]];
159 s += sq[pix[7]];
160 pix += 8;
161 }
162 pix += line_size - 16;
163 }
164 return s;
165 }
166
167
/**
 * Copy an 8x8 block of pixels into a contiguous array of 64 coefficients.
 *
 * @param block     destination, written row by row as block[row*8 + column]
 * @param pixels    top-left source pixel
 * @param line_size distance in bytes between the starts of two source rows
 */
static void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        DCTELEM *out = block + row * 8;

        for (col = 0; col < 8; col++)
            out[col] = pixels[col];
        pixels += line_size;
    }
}
186
/**
 * Store the element-wise difference s1 - s2 of two 8x8 pixel blocks.
 *
 * @param block  destination, written row by row as block[row*8 + column]
 * @param s1     top-left pixel of the minuend block
 * @param s2     top-left pixel of the subtrahend block
 * @param stride distance in bytes between rows, shared by s1 and s2
 */
static void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1,
                          const UINT8 *s2, int stride){
    int row, col;

    for (row = 0; row < 8; row++) {
        DCTELEM *out = block + row * 8;

        for (col = 0; col < 8; col++)
            out[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
    }
}
206
207
208 static void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
209 int line_size)
210 {
211 int i;
212 UINT8 *cm = cropTbl + MAX_NEG_CROP;
213
214 /* read the pixels */
215 for(i=0;i<8;i++) {
216 pixels[0] = cm[block[0]];
217 pixels[1] = cm[block[1]];
218 pixels[2] = cm[block[2]];
219 pixels[3] = cm[block[3]];
220 pixels[4] = cm[block[4]];
221 pixels[5] = cm[block[5]];
222 pixels[6] = cm[block[6]];
223 pixels[7] = cm[block[7]];
224
225 pixels += line_size;
226 block += 8;
227 }
228 }
229
230 static void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
231 int line_size)
232 {
233 int i;
234 UINT8 *cm = cropTbl + MAX_NEG_CROP;
235
236 /* read the pixels */
237 for(i=0;i<8;i++) {
238 pixels[0] = cm[pixels[0] + block[0]];
239 pixels[1] = cm[pixels[1] + block[1]];
240 pixels[2] = cm[pixels[2] + block[2]];
241 pixels[3] = cm[pixels[3] + block[3]];
242 pixels[4] = cm[pixels[4] + block[4]];
243 pixels[5] = cm[pixels[5] + block[5]];
244 pixels[6] = cm[pixels[6] + block[6]];
245 pixels[7] = cm[pixels[7] + block[7]];
246 pixels += line_size;
247 block += 8;
248 }
249 }
250 #if 0
251
252 #define PIXOP2(OPNAME, OP) \
253 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
254 {\
255 int i;\
256 for(i=0; i<h; i++){\
257 OP(*((uint64_t*)block), LD64(pixels));\
258 pixels+=line_size;\
259 block +=line_size;\
260 }\
261 }\
262 \
263 static void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
264 {\
265 int i;\
266 for(i=0; i<h; i++){\
267 const uint64_t a= LD64(pixels );\
268 const uint64_t b= LD64(pixels+1);\
269 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
270 pixels+=line_size;\
271 block +=line_size;\
272 }\
273 }\
274 \
275 static void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
276 {\
277 int i;\
278 for(i=0; i<h; i++){\
279 const uint64_t a= LD64(pixels );\
280 const uint64_t b= LD64(pixels+1);\
281 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
282 pixels+=line_size;\
283 block +=line_size;\
284 }\
285 }\
286 \
287 static void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
288 {\
289 int i;\
290 for(i=0; i<h; i++){\
291 const uint64_t a= LD64(pixels );\
292 const uint64_t b= LD64(pixels+line_size);\
293 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
294 pixels+=line_size;\
295 block +=line_size;\
296 }\
297 }\
298 \
299 static void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
300 {\
301 int i;\
302 for(i=0; i<h; i++){\
303 const uint64_t a= LD64(pixels );\
304 const uint64_t b= LD64(pixels+line_size);\
305 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
306 pixels+=line_size;\
307 block +=line_size;\
308 }\
309 }\
310 \
311 static void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
312 {\
313 int i;\
314 const uint64_t a= LD64(pixels );\
315 const uint64_t b= LD64(pixels+1);\
316 uint64_t l0= (a&0x0303030303030303ULL)\
317 + (b&0x0303030303030303ULL)\
318 + 0x0202020202020202ULL;\
319 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
320 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
321 uint64_t l1,h1;\
322 \
323 pixels+=line_size;\
324 for(i=0; i<h; i+=2){\
325 uint64_t a= LD64(pixels );\
326 uint64_t b= LD64(pixels+1);\
327 l1= (a&0x0303030303030303ULL)\
328 + (b&0x0303030303030303ULL);\
329 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
330 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
331 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
332 pixels+=line_size;\
333 block +=line_size;\
334 a= LD64(pixels );\
335 b= LD64(pixels+1);\
336 l0= (a&0x0303030303030303ULL)\
337 + (b&0x0303030303030303ULL)\
338 + 0x0202020202020202ULL;\
339 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
340 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
341 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
342 pixels+=line_size;\
343 block +=line_size;\
344 }\
345 }\
346 \
347 static void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
348 {\
349 int i;\
350 const uint64_t a= LD64(pixels );\
351 const uint64_t b= LD64(pixels+1);\
352 uint64_t l0= (a&0x0303030303030303ULL)\
353 + (b&0x0303030303030303ULL)\
354 + 0x0101010101010101ULL;\
355 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
356 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
357 uint64_t l1,h1;\
358 \
359 pixels+=line_size;\
360 for(i=0; i<h; i+=2){\
361 uint64_t a= LD64(pixels );\
362 uint64_t b= LD64(pixels+1);\
363 l1= (a&0x0303030303030303ULL)\
364 + (b&0x0303030303030303ULL);\
365 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
366 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
367 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
368 pixels+=line_size;\
369 block +=line_size;\
370 a= LD64(pixels );\
371 b= LD64(pixels+1);\
372 l0= (a&0x0303030303030303ULL)\
373 + (b&0x0303030303030303ULL)\
374 + 0x0101010101010101ULL;\
375 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
376 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
377 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
378 pixels+=line_size;\
379 block +=line_size;\
380 }\
381 }\
382 \
383 CALL_2X_PIXELS(OPNAME ## _pixels16 , OPNAME ## _pixels , 8)\
384 CALL_2X_PIXELS(OPNAME ## _pixels16_x2 , OPNAME ## _pixels_x2 , 8)\
385 CALL_2X_PIXELS(OPNAME ## _pixels16_y2 , OPNAME ## _pixels_y2 , 8)\
386 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2, OPNAME ## _pixels_xy2, 8)\
387 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2 , OPNAME ## _no_rnd_pixels_x2 , 8)\
388 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2 , OPNAME ## _no_rnd_pixels_y2 , 8)\
389 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2, OPNAME ## _no_rnd_pixels_xy2, 8)\
390 \
391 void (*OPNAME ## _pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
392 {\
393 OPNAME ## _pixels,\
394 OPNAME ## _pixels_x2,\
395 OPNAME ## _pixels_y2,\
396 OPNAME ## _pixels_xy2},\
397 {\
398 OPNAME ## _pixels16,\
399 OPNAME ## _pixels16_x2,\
400 OPNAME ## _pixels16_y2,\
401 OPNAME ## _pixels16_xy2}\
402 };\
403 \
404 void (*OPNAME ## _no_rnd_pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
405 {\
406 OPNAME ## _pixels,\
407 OPNAME ## _no_rnd_pixels_x2,\
408 OPNAME ## _no_rnd_pixels_y2,\
409 OPNAME ## _no_rnd_pixels_xy2},\
410 {\
411 OPNAME ## _pixels16,\
412 OPNAME ## _no_rnd_pixels16_x2,\
413 OPNAME ## _no_rnd_pixels16_y2,\
414 OPNAME ## _no_rnd_pixels16_xy2}\
415 };
416
417 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 32 bit variant (the disabled #if 0 branch above is the 64 bit variant)
419
/*
 * PIXOP2(OPNAME, OP): generates the 32-bit-word pixel copy/average routines.
 *
 * OPNAME is the name prefix of every generated function; OP(dst, value) is the
 * store operation applied to each 32-bit word (four packed pixels).  LD32 and
 * CALL_2X_PIXELS are defined elsewhere; LD32 presumably loads a 32-bit word
 * from a byte pointer and CALL_2X_PIXELS presumably builds a 16-wide function
 * out of two calls to an 8-wide one -- confirm against their definitions.
 *
 * Generated functions (h is the number of rows):
 *   <OPNAME>_pixels8                 plain 8-pixel-wide rows, two words per row
 *   <OPNAME>_no_rnd_pixels8          forwards to <OPNAME>_pixels8
 *   <OPNAME>_pixels8_l2 / _no_rnd_   per-byte average of two sources with
 *                                    independent strides; the rounding form is
 *                                    (a|b) - (((a^b)&0xFE..)>>1) == (a+b+1)>>1,
 *                                    the no_rnd form is
 *                                    (a&b) + (((a^b)&0xFE..)>>1) == (a+b)>>1.
 *                                    The 0xFE mask drops the low bit of each
 *                                    byte before the shift so that nothing
 *                                    leaks into the neighbouring byte.
 *   <OPNAME>_pixels16_l2 / _no_rnd_  two 8-wide calls at byte offsets 0 and 8
 *   <OPNAME>_pixels8_x2 / _y2        _l2 of pixels with pixels+1 (x2) or with
 *                                    pixels+line_size (y2); _no_rnd_ likewise
 *   <OPNAME>_pixels8_l4 / _no_rnd_   per-byte average of four sources: the low
 *                                    two bits and the high six bits of every
 *                                    byte are summed separately, with a bias of
 *                                    2 (rounding) or 1 (no_rnd) added to the
 *                                    low part, giving (a+b+c+d+bias)>>2
 *   <OPNAME>_pixels16_l4 / _no_rnd_  two 8-wide calls at byte offsets 0 and 8
 *   <OPNAME>_pixels8_xy2 / _no_rnd_  average of each pixel with its right,
 *                                    lower and lower-right neighbours, using
 *                                    the same low/high split as _l4; reads
 *                                    h+1 source rows.  The row loop handles two
 *                                    rows per pass, so it is written for even h.
 *
 * Generated tables, indexed [size][kind] with kind = plain, x2, y2, xy2:
 *   <OPNAME>_pixels_tab[0] / [1]         16-wide / 8-wide rounding functions
 *   <OPNAME>_no_rnd_pixels_tab[0] / [1]  16-wide / 8-wide no_rnd functions
 *                                        (the plain entry is shared with the
 *                                        rounding table)
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block )), LD32(pixels ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1 ]);\
        b= LD32(&src2[i*src_stride2 ]);\
        /* per byte: (a+b)>>1, i.e. the average rounded down */\
        OP(*((uint32_t*)&dst[i*dst_stride ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1 ]);\
        b= LD32(&src2[i*src_stride2 ]);\
        /* per byte: (a+b+1)>>1, i.e. the average rounded up */\
        OP(*((uint32_t*)&dst[i*dst_stride ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                 int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        /* l*: sums of the low two bits per byte (plus the rounding bias), */\
        /* h*: sums of the high six bits per byte, already divided by four */\
        l0= (a&0x03030303UL)\
          + (b&0x03030303UL)\
          + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
          + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0= (a&0x03030303UL)\
          + (b&0x03030303UL)\
          + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
          + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0= (a&0x03030303UL)\
          + (b&0x03030303UL)\
          + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
          + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0= (a&0x03030303UL)\
          + (b&0x03030303UL)\
          + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
          + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    /* two passes, one per 4-pixel-wide column of the 8-wide block */\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0= (a&0x03030303UL)\
                   + (b&0x03030303UL)\
                   + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
        \
        pixels+=line_size;\
        /* l0/h0 and l1/h1 alternate as "row above" and "row below" so that */\
        /* every source row is loaded and split only once                   */\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels );\
            uint32_t b= LD32(pixels+1);\
            l1= (a&0x03030303UL)\
              + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels );\
            b= LD32(pixels+1);\
            l0= (a&0x03030303UL)\
              + (b&0x03030303UL)\
              + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* rewind to the top row and step 4 bytes right for the second pass */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0= (a&0x03030303UL)\
                   + (b&0x03030303UL)\
                   + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
        \
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels );\
            uint32_t b= LD32(pixels+1);\
            l1= (a&0x03030303UL)\
              + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels );\
            b= LD32(pixels+1);\
            l0= (a&0x03030303UL)\
              + (b&0x03030303UL)\
              + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16 , OPNAME ## _pixels8 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2 , OPNAME ## _pixels8_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2 , OPNAME ## _pixels8_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2, OPNAME ## _pixels8_xy2, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16 , OPNAME ## _pixels8 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2 , OPNAME ## _no_rnd_pixels8_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2 , OPNAME ## _no_rnd_pixels8_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2, OPNAME ## _no_rnd_pixels8_xy2, 8)\
\
void (*OPNAME ## _pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    {\
        OPNAME ## _pixels16,\
        OPNAME ## _pixels16_x2,\
        OPNAME ## _pixels16_y2,\
        OPNAME ## _pixels16_xy2},\
    {\
        OPNAME ## _pixels8,\
        OPNAME ## _pixels8_x2,\
        OPNAME ## _pixels8_y2,\
        OPNAME ## _pixels8_xy2},\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    {\
        OPNAME ## _pixels16,\
        OPNAME ## _no_rnd_pixels16_x2,\
        OPNAME ## _no_rnd_pixels16_y2,\
        OPNAME ## _no_rnd_pixels16_xy2},\
    {\
        OPNAME ## _pixels8,\
        OPNAME ## _no_rnd_pixels8_x2,\
        OPNAME ## _no_rnd_pixels8_y2,\
        OPNAME ## _no_rnd_pixels8_xy2},\
};
687
/* Store operation for the "avg" family (32-bit branch): replaces a with the
 * per-byte average of a and b, rounded up.  (a|b) - ((a^b)>>1) equals
 * (a+b+1)>>1; the 0xFE mask clears the low bit of every byte before the shift
 * so that no bit crosses into the neighbouring byte. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
#endif
/* Store operation for the "put" family: plain assignment. */
#define op_put(a, b) a = b

/* Instantiate the avg_* and put_* function families and their tables; the
 * store operations are only needed for these two expansions and are
 * undefined again right after. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
696
697 #if 0
/* FIXME: this stuff could be removed as it is not really used anymore */
699 #define PIXOP(BTYPE, OPNAME, OP, INCR) \
700 \
701 static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
702 { \
703 BTYPE *p; \
704 const UINT8 *pix; \
705 \
706 p = block; \
707 pix = pixels; \
708 do { \
709 OP(p[0], pix[0]); \
710 OP(p[1], pix[1]); \
711 OP(p[2], pix[2]); \
712 OP(p[3], pix[3]); \
713 OP(p[4], pix[4]); \
714 OP(p[5], pix[5]); \
715 OP(p[6], pix[6]); \
716 OP(p[7], pix[7]); \
717 pix += line_size; \
718 p += INCR; \
719 } while (--h);; \
720 } \
721 \
722 static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
723 { \
724 BTYPE *p; \
725 const UINT8 *pix; \
726 \
727 p = block; \
728 pix = pixels; \
729 do { \
730 OP(p[0], avg2(pix[0], pix[1])); \
731 OP(p[1], avg2(pix[1], pix[2])); \
732 OP(p[2], avg2(pix[2], pix[3])); \
733 OP(p[3], avg2(pix[3], pix[4])); \
734 OP(p[4], avg2(pix[4], pix[5])); \
735 OP(p[5], avg2(pix[5], pix[6])); \
736 OP(p[6], avg2(pix[6], pix[7])); \
737 OP(p[7], avg2(pix[7], pix[8])); \
738 pix += line_size; \
739 p += INCR; \
740 } while (--h); \
741 } \
742 \
743 static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
744 { \
745 BTYPE *p; \
746 const UINT8 *pix; \
747 const UINT8 *pix1; \
748 \
749 p = block; \
750 pix = pixels; \
751 pix1 = pixels + line_size; \
752 do { \
753 OP(p[0], avg2(pix[0], pix1[0])); \
754 OP(p[1], avg2(pix[1], pix1[1])); \
755 OP(p[2], avg2(pix[2], pix1[2])); \
756 OP(p[3], avg2(pix[3], pix1[3])); \
757 OP(p[4], avg2(pix[4], pix1[4])); \
758 OP(p[5], avg2(pix[5], pix1[5])); \
759 OP(p[6], avg2(pix[6], pix1[6])); \
760 OP(p[7], avg2(pix[7], pix1[7])); \
761 pix += line_size; \
762 pix1 += line_size; \
763 p += INCR; \
764 } while(--h); \
765 } \
766 \
767 static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
768 { \
769 BTYPE *p; \
770 const UINT8 *pix; \
771 const UINT8 *pix1; \
772 \
773 p = block; \
774 pix = pixels; \
775 pix1 = pixels + line_size; \
776 do { \
777 OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1])); \
778 OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2])); \
779 OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3])); \
780 OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4])); \
781 OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5])); \
782 OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6])); \
783 OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7])); \
784 OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8])); \
785 pix += line_size; \
786 pix1 += line_size; \
787 p += INCR; \
788 } while(--h); \
789 } \
790 \
791 void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
792 OPNAME ## _pixels, \
793 OPNAME ## _pixels_x2, \
794 OPNAME ## _pixels_y2, \
795 OPNAME ## _pixels_xy2, \
796 };
797
798 /* rounding primitives */
799 #define avg2(a,b) ((a+b+1)>>1)
800 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
801
802 #define op_avg(a, b) a = avg2(a, b)
803 #define op_sub(a, b) a -= b
804 #define op_put(a, b) a = b
805
806 PIXOP(DCTELEM, sub, op_sub, 8)
807 PIXOP(uint8_t, avg, op_avg, line_size)
808 PIXOP(uint8_t, put, op_put, line_size)
809
810 /* not rounding primitives */
811 #undef avg2
812 #undef avg4
813 #define avg2(a,b) ((a+b)>>1)
814 #define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
815
816 PIXOP(uint8_t, avg_no_rnd, op_avg, line_size)
817 PIXOP(uint8_t, put_no_rnd, op_put, line_size)
818 /* motion estimation */
819
820 #undef avg2
821 #undef avg4
822 #endif
823
/* Rounding averages of two and of four values: (sum + half) >> log2(count).
 * Every argument is parenthesized so that an expression argument such as
 * `x << 2` or `c ? a : b` is averaged as a whole instead of being torn apart
 * by operator precedence inside the expansion.  Each argument is evaluated
 * exactly once. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
826
827
/**
 * Bilinear interpolation of an 8-pixel-wide block at a fixed sub-pixel offset
 * given in 1/16 pixel units.
 *
 * Each output pixel is the weighted sum of the source pixel, its right, lower
 * and lower-right neighbours; the four weights add up to 256, hence the >>8.
 * Nine source pixels per row and h+1 source rows are read.
 *
 * @param dst     top-left destination pixel
 * @param src     top-left source pixel
 * @param stride  distance in bytes between rows, shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal offset, weight of the right-hand neighbours
 * @param y16     vertical offset, weight of the lower neighbours
 * @param rounder constant added to the weighted sum before the shift
 */
static void gmc1_c(UINT8 *dst, UINT8 *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_tl = (16 - x16) * (16 - y16);
    const int w_tr = x16 * (16 - y16);
    const int w_bl = (16 - x16) * y16;
    const int w_br = x16 * y16;
    int row, col;

    for (row = 0; row < h; row++) {
        const UINT8 *below = src + stride;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl * src[col] + w_tr * src[col + 1]
                        + w_bl * below[col] + w_br * below[col + 1]
                        + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
850
851 static void gmc_c(UINT8 *dst, UINT8 *src, int stride, int h, int ox, int oy,
852 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
853 {
854 int y, vx, vy;
855 const int s= 1<<shift;
856
857 width--;
858 height--;
859
860 for(y=0; y<h; y++){
861 int x;
862
863 vx= ox;
864 vy= oy;
865 for(x=0; x<8; x++){ //XXX FIXME optimize
866 int src_x, src_y, frac_x, frac_y, index;
867
868 src_x= vx>>16;
869 src_y= vy>>16;
870 frac_x= src_x&(s-1);
871 frac_y= src_y&(s-1);
872 src_x>>=shift;
873 src_y>>=shift;
874
875 if((unsigned)src_x < width){
876 if((unsigned)src_y < height){
877 index= src_x + src_y*stride;
878 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
879 + src[index +1]* frac_x )*(s-frac_y)
880 + ( src[index+stride ]*(s-frac_x)
881 + src[index+stride+1]* frac_x )* frac_y
882 + r)>>(shift*2);
883 }else{
884 index= src_x + clip(src_y, 0, height)*stride;
885 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
886 + src[index +1]* frac_x )*s
887 + r)>>(shift*2);
888 }
889 }else{
890 if((unsigned)src_y < height){
891 index= clip(src_x, 0, width) + src_y*stride;
892 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
893 + src[index+stride ]* frac_y )*s
894 + r)>>(shift*2);
895 }else{
896 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
897 dst[y*stride + x]= src[index ];
898 }
899 }
900
901 vx+= dxx;
902 vy+= dyx;
903 }
904 ox += dxy;
905 oy += dyy;
906 }
907 }
908
909 static inline void copy_block17(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
910 {
911 int i;
912 for(i=0; i<h; i++)
913 {
914 ST32(dst , LD32(src ));
915 ST32(dst+4 , LD32(src+4 ));
916 ST32(dst+8 , LD32(src+8 ));
917 ST32(dst+12, LD32(src+12));
918 dst[16]= src[16];
919 dst+=dstStride;
920 src+=srcStride;
921 }
922 }
923
924 static inline void copy_block9(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
925 {
926 int i;
927 for(i=0; i<h; i++)
928 {
929 ST32(dst , LD32(src ));
930 ST32(dst+4 , LD32(src+4 ));
931 dst[8]= src[8];
932 dst+=dstStride;
933 src+=srcStride;
934 }
935 }
936
/*
 * QPEL_MC(r, OPNAME, RND, OP) generates one family of quarter-pel motion
 * compensation routines for 8x8 and 16x16 blocks.
 *
 *   r      - not referenced anywhere in the macro body.
 *   OPNAME - prefix pasted onto every generated function name and onto the
 *            pixelsN / pixelsN_l2 / pixelsN_l4 helpers that are called.
 *   RND    - pasted between "put" and "mpeg4_qpel..." to select which
 *            put-style lowpass helper produces the intermediate buffers.
 *   OP     - store operation applied to every filtered sample; it is handed
 *            the destination lvalue and the unscaled filter sum, and it
 *            uses the local crop-table pointer named cm.
 *
 * The lowpass helpers apply taps 20, -6, 3, -1 on each side of the
 * half-sample position.  Taps that would fall outside src[0..8] (8-wide)
 * or src[0..16] (16-wide) reuse samples from inside that range.
 *
 * In the generated qpelN_mcXY_c functions, X selects the horizontal and Y
 * the vertical variant; the final table is indexed as X + 4*Y, with the
 * 16x16 functions in row 0 and the 8x8 functions in row 1.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int row;\
    /* one 8-pixel output row per iteration */\
    for(row=0; row<h; row++){\
        OP(dst[0], 20*(src[0]+src[1]) - 6*(src[0]+src[2]) + 3*(src[1]+src[3]) - (src[2]+src[4]));\
        OP(dst[1], 20*(src[1]+src[2]) - 6*(src[0]+src[3]) + 3*(src[0]+src[4]) - (src[1]+src[5]));\
        OP(dst[2], 20*(src[2]+src[3]) - 6*(src[1]+src[4]) + 3*(src[0]+src[5]) - (src[0]+src[6]));\
        OP(dst[3], 20*(src[3]+src[4]) - 6*(src[2]+src[5]) + 3*(src[1]+src[6]) - (src[0]+src[7]));\
        OP(dst[4], 20*(src[4]+src[5]) - 6*(src[3]+src[6]) + 3*(src[2]+src[7]) - (src[1]+src[8]));\
        OP(dst[5], 20*(src[5]+src[6]) - 6*(src[4]+src[7]) + 3*(src[3]+src[8]) - (src[2]+src[8]));\
        OP(dst[6], 20*(src[6]+src[7]) - 6*(src[5]+src[8]) + 3*(src[4]+src[8]) - (src[3]+src[7]));\
        OP(dst[7], 20*(src[7]+src[8]) - 6*(src[6]+src[8]) + 3*(src[5]+src[7]) - (src[4]+src[6]));\
        dst += dstStride;\
        src += srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int col;\
    /* one 8-pixel output column per iteration; the 9 inputs are read up front */\
    for(col=0; col<w; col++){\
        const int src0 = src[0*srcStride];\
        const int src1 = src[1*srcStride];\
        const int src2 = src[2*srcStride];\
        const int src3 = src[3*srcStride];\
        const int src4 = src[4*srcStride];\
        const int src5 = src[5*srcStride];\
        const int src6 = src[6*srcStride];\
        const int src7 = src[7*srcStride];\
        const int src8 = src[8*srcStride];\
        OP(dst[0*dstStride], 20*(src0+src1) - 6*(src0+src2) + 3*(src1+src3) - (src2+src4));\
        OP(dst[1*dstStride], 20*(src1+src2) - 6*(src0+src3) + 3*(src0+src4) - (src1+src5));\
        OP(dst[2*dstStride], 20*(src2+src3) - 6*(src1+src4) + 3*(src0+src5) - (src0+src6));\
        OP(dst[3*dstStride], 20*(src3+src4) - 6*(src2+src5) + 3*(src1+src6) - (src0+src7));\
        OP(dst[4*dstStride], 20*(src4+src5) - 6*(src3+src6) + 3*(src2+src7) - (src1+src8));\
        OP(dst[5*dstStride], 20*(src5+src6) - 6*(src4+src7) + 3*(src3+src8) - (src2+src8));\
        OP(dst[6*dstStride], 20*(src6+src7) - 6*(src5+src8) + 3*(src4+src8) - (src3+src7));\
        OP(dst[7*dstStride], 20*(src7+src8) - 6*(src6+src8) + 3*(src5+src7) - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int row;\
    /* one 16-pixel output row per iteration */\
    for(row=0; row<h; row++){\
        OP(dst[ 0], 20*(src[ 0]+src[ 1]) - 6*(src[ 0]+src[ 2]) + 3*(src[ 1]+src[ 3]) - (src[ 2]+src[ 4]));\
        OP(dst[ 1], 20*(src[ 1]+src[ 2]) - 6*(src[ 0]+src[ 3]) + 3*(src[ 0]+src[ 4]) - (src[ 1]+src[ 5]));\
        OP(dst[ 2], 20*(src[ 2]+src[ 3]) - 6*(src[ 1]+src[ 4]) + 3*(src[ 0]+src[ 5]) - (src[ 0]+src[ 6]));\
        OP(dst[ 3], 20*(src[ 3]+src[ 4]) - 6*(src[ 2]+src[ 5]) + 3*(src[ 1]+src[ 6]) - (src[ 0]+src[ 7]));\
        OP(dst[ 4], 20*(src[ 4]+src[ 5]) - 6*(src[ 3]+src[ 6]) + 3*(src[ 2]+src[ 7]) - (src[ 1]+src[ 8]));\
        OP(dst[ 5], 20*(src[ 5]+src[ 6]) - 6*(src[ 4]+src[ 7]) + 3*(src[ 3]+src[ 8]) - (src[ 2]+src[ 9]));\
        OP(dst[ 6], 20*(src[ 6]+src[ 7]) - 6*(src[ 5]+src[ 8]) + 3*(src[ 4]+src[ 9]) - (src[ 3]+src[10]));\
        OP(dst[ 7], 20*(src[ 7]+src[ 8]) - 6*(src[ 6]+src[ 9]) + 3*(src[ 5]+src[10]) - (src[ 4]+src[11]));\
        OP(dst[ 8], 20*(src[ 8]+src[ 9]) - 6*(src[ 7]+src[10]) + 3*(src[ 6]+src[11]) - (src[ 5]+src[12]));\
        OP(dst[ 9], 20*(src[ 9]+src[10]) - 6*(src[ 8]+src[11]) + 3*(src[ 7]+src[12]) - (src[ 6]+src[13]));\
        OP(dst[10], 20*(src[10]+src[11]) - 6*(src[ 9]+src[12]) + 3*(src[ 8]+src[13]) - (src[ 7]+src[14]));\
        OP(dst[11], 20*(src[11]+src[12]) - 6*(src[10]+src[13]) + 3*(src[ 9]+src[14]) - (src[ 8]+src[15]));\
        OP(dst[12], 20*(src[12]+src[13]) - 6*(src[11]+src[14]) + 3*(src[10]+src[15]) - (src[ 9]+src[16]));\
        OP(dst[13], 20*(src[13]+src[14]) - 6*(src[12]+src[15]) + 3*(src[11]+src[16]) - (src[10]+src[16]));\
        OP(dst[14], 20*(src[14]+src[15]) - 6*(src[13]+src[16]) + 3*(src[12]+src[16]) - (src[11]+src[15]));\
        OP(dst[15], 20*(src[15]+src[16]) - 6*(src[14]+src[16]) + 3*(src[13]+src[15]) - (src[12]+src[14]));\
        dst += dstStride;\
        src += srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int col;\
    /* one 16-pixel output column per iteration; the 17 inputs are read up front */\
    for(col=0; col<w; col++){\
        const int src0  = src[ 0*srcStride];\
        const int src1  = src[ 1*srcStride];\
        const int src2  = src[ 2*srcStride];\
        const int src3  = src[ 3*srcStride];\
        const int src4  = src[ 4*srcStride];\
        const int src5  = src[ 5*srcStride];\
        const int src6  = src[ 6*srcStride];\
        const int src7  = src[ 7*srcStride];\
        const int src8  = src[ 8*srcStride];\
        const int src9  = src[ 9*srcStride];\
        const int src10 = src[10*srcStride];\
        const int src11 = src[11*srcStride];\
        const int src12 = src[12*srcStride];\
        const int src13 = src[13*srcStride];\
        const int src14 = src[14*srcStride];\
        const int src15 = src[15*srcStride];\
        const int src16 = src[16*srcStride];\
        OP(dst[ 0*dstStride], 20*(src0 +src1 ) - 6*(src0 +src2 ) + 3*(src1 +src3 ) - (src2 +src4 ));\
        OP(dst[ 1*dstStride], 20*(src1 +src2 ) - 6*(src0 +src3 ) + 3*(src0 +src4 ) - (src1 +src5 ));\
        OP(dst[ 2*dstStride], 20*(src2 +src3 ) - 6*(src1 +src4 ) + 3*(src0 +src5 ) - (src0 +src6 ));\
        OP(dst[ 3*dstStride], 20*(src3 +src4 ) - 6*(src2 +src5 ) + 3*(src1 +src6 ) - (src0 +src7 ));\
        OP(dst[ 4*dstStride], 20*(src4 +src5 ) - 6*(src3 +src6 ) + 3*(src2 +src7 ) - (src1 +src8 ));\
        OP(dst[ 5*dstStride], 20*(src5 +src6 ) - 6*(src4 +src7 ) + 3*(src3 +src8 ) - (src2 +src9 ));\
        OP(dst[ 6*dstStride], 20*(src6 +src7 ) - 6*(src5 +src8 ) + 3*(src4 +src9 ) - (src3 +src10));\
        OP(dst[ 7*dstStride], 20*(src7 +src8 ) - 6*(src6 +src9 ) + 3*(src5 +src10) - (src4 +src11));\
        OP(dst[ 8*dstStride], 20*(src8 +src9 ) - 6*(src7 +src10) + 3*(src6 +src11) - (src5 +src12));\
        OP(dst[ 9*dstStride], 20*(src9 +src10) - 6*(src8 +src11) + 3*(src7 +src12) - (src6 +src13));\
        OP(dst[10*dstStride], 20*(src10+src11) - 6*(src9 +src12) + 3*(src8 +src13) - (src7 +src14));\
        OP(dst[11*dstStride], 20*(src11+src12) - 6*(src10+src13) + 3*(src9 +src14) - (src8 +src15));\
        OP(dst[12*dstStride], 20*(src12+src13) - 6*(src11+src14) + 3*(src10+src15) - (src9 +src16));\
        OP(dst[13*dstStride], 20*(src13+src14) - 6*(src12+src15) + 3*(src11+src16) - (src10+src16));\
        OP(dst[14*dstStride], 20*(src14+src15) - 6*(src13+src16) + 3*(src12+src16) - (src11+src15));\
        OP(dst[15*dstStride], 20*(src15+src16) - 6*(src14+src16) + 3*(src13+src15) - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
/* ---- 8x8: plain copy and purely horizontal variants ---- */\
static void OPNAME ## qpel8_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## pixels8(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
/* ---- 8x8: purely vertical variants (source copied to a 16-stride scratch block) ---- */\
static void OPNAME ## qpel8_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16, 8);\
}\
\
static void OPNAME ## qpel8_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
/* ---- 8x8: variants blending four planes (full, halfH, halfV, halfHV) ---- */\
static void OPNAME ## qpel8_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
/* ---- 8x8: variants blending two filtered planes ---- */\
static void OPNAME ## qpel8_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[72];\
    UINT8 halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[72];\
    UINT8 halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[72];\
    UINT8 halfV[64];\
    UINT8 halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
/* horizontal lowpass into a 9-row scratch, then vertical lowpass straight to dst */\
static void OPNAME ## qpel8_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8, 8);\
}\
/* ---- 16x16: plain copy and purely horizontal variants ---- */\
static void OPNAME ## qpel16_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## pixels16(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
/* ---- 16x16: purely vertical variants (source copied to a 24-stride scratch block) ---- */\
static void OPNAME ## qpel16_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24, 16);\
}\
\
static void OPNAME ## qpel16_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
/* ---- 16x16: variants blending four planes (full, halfH, halfV, halfHV) ---- */\
static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[272];\
    UINT8 halfV[256];\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[272];\
    UINT8 halfV[256];\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[272];\
    UINT8 halfV[256];\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[272];\
    UINT8 halfV[256];\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
/* ---- 16x16: variants blending two filtered planes ---- */\
static void OPNAME ## qpel16_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[272];\
    UINT8 halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[272];\
    UINT8 halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[272];\
    UINT8 halfV[256];\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[272];\
    UINT8 halfV[256];\
    UINT8 halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
/* horizontal lowpass into a 17-row scratch, then vertical lowpass straight to dst */\
static void OPNAME ## qpel16_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16, 16);\
}\
/* dispatch table: [0] = 16x16, [1] = 8x8; entries ordered mc00, mc10, ..., mc33 */\
qpel_mc_func OPNAME ## qpel_pixels_tab[2][16]={ \
    {\
        OPNAME ## qpel16_mc00_c,                                                                   \
        OPNAME ## qpel16_mc10_c,                                                                   \
        OPNAME ## qpel16_mc20_c,                                                                   \
        OPNAME ## qpel16_mc30_c,                                                                   \
        OPNAME ## qpel16_mc01_c,                                                                   \
        OPNAME ## qpel16_mc11_c,                                                                   \
        OPNAME ## qpel16_mc21_c,                                                                   \
        OPNAME ## qpel16_mc31_c,                                                                   \
        OPNAME ## qpel16_mc02_c,                                                                   \
        OPNAME ## qpel16_mc12_c,                                                                   \
        OPNAME ## qpel16_mc22_c,                                                                   \
        OPNAME ## qpel16_mc32_c,                                                                   \
        OPNAME ## qpel16_mc03_c,                                                                   \
        OPNAME ## qpel16_mc13_c,                                                                   \
        OPNAME ## qpel16_mc23_c,                                                                   \
        OPNAME ## qpel16_mc33_c,                                                                   \
    },{\
        OPNAME ## qpel8_mc00_c,                                                                   \
        OPNAME ## qpel8_mc10_c,                                                                   \
        OPNAME ## qpel8_mc20_c,                                                                   \
        OPNAME ## qpel8_mc30_c,                                                                   \
        OPNAME ## qpel8_mc01_c,                                                                   \
        OPNAME ## qpel8_mc11_c,                                                                   \
        OPNAME ## qpel8_mc21_c,                                                                   \
        OPNAME ## qpel8_mc31_c,                                                                   \
        OPNAME ## qpel8_mc02_c,                                                                   \
        OPNAME ## qpel8_mc12_c,                                                                   \
        OPNAME ## qpel8_mc22_c,                                                                   \
        OPNAME ## qpel8_mc32_c,                                                                   \
        OPNAME ## qpel8_mc03_c,                                                                   \
        OPNAME ## qpel8_mc13_c,                                                                   \
        OPNAME ## qpel8_mc23_c,                                                                   \
        OPNAME ## qpel8_mc33_c,                                                                   \
    }\
};
1341
1342 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1343 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1344 #define op_put(a, b) a = cm[((b) + 16)>>5]
1345 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1346
1347 QPEL_MC(0, put_ , _ , op_put)
1348 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1349 QPEL_MC(0, avg_ , _ , op_avg)
1350 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1351 #undef op_avg
1352 #undef op_avg_no_rnd
1353 #undef op_put
1354 #undef op_put_no_rnd
1355
1356 static int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1357 {
1358 int s, i;
1359
1360 s = 0;
1361 for(i=0;i<16;i++) {
1362 s += abs(pix1[0] - pix2[0]);
1363 s += abs(pix1[1] - pix2[1]);
1364 s += abs(pix1[2] - pix2[2]);
1365 s += abs(pix1[3] - pix2[3]);
1366 s += abs(pix1[4] - pix2[4]);
1367 s += abs(pix1[5] - pix2[5]);
1368 s += abs(pix1[6] - pix2[6]);
1369 s += abs(pix1[7] - pix2[7]);
1370 s += abs(pix1[8] - pix2[8]);
1371 s += abs(pix1[9] - pix2[9]);
1372 s += abs(pix1[10] - pix2[10]);
1373 s += abs(pix1[11] - pix2[11]);
1374 s += abs(pix1[12] - pix2[12]);
1375 s += abs(pix1[13] - pix2[13]);
1376 s += abs(pix1[14] - pix2[14]);
1377 s += abs(pix1[15] - pix2[15]);
1378 pix1 += line_size;
1379 pix2 += line_size;
1380 }
1381 return s;
1382 }
1383
1384 static int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1385 {
1386 int s, i;
1387
1388 s = 0;
1389 for(i=0;i<16;i++) {
1390 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1391 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1392 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1393 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1394 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1395 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1396 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1397 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1398 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1399 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1400 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1401 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1402 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1403 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1404 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1405 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1406 pix1 += line_size;
1407 pix2 += line_size;
1408 }
1409 return s;
1410 }
1411
1412 static int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1413 {
1414 int s, i;
1415 UINT8 *pix3 = pix2 + line_size;
1416
1417 s = 0;
1418 for(i=0;i<16;i++) {
1419 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1420 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1421 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1422 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1423 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1424 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1425 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1426 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1427 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1428 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1429 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1430 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1431 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1432 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1433 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1434 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1435 pix1 += line_size;
1436 pix2 += line_size;
1437 pix3 += line_size;
1438 }
1439 return s;
1440 }
1441
1442 static int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1443 {
1444 int s, i;
1445 UINT8 *pix3 = pix2 + line_size;
1446
1447 s = 0;
1448 for(i=0;i<16;i++) {
1449 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1450 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1451 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1452 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1453 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1454 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1455 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1456 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1457 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1458 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1459 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1460 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1461 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1462 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1463 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1464 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1465 pix1 += line_size;
1466 pix2 += line_size;
1467 pix3 += line_size;
1468 }
1469 return s;
1470 }
1471
1472 static int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1473 {
1474 int s, i;
1475
1476 s = 0;
1477 for(i=0;i<8;i++) {
1478 s += abs(pix1[0] - pix2[0]);
1479 s += abs(pix1[1] - pix2[1]);
1480 s += abs(pix1[2] - pix2[2]);
1481 s += abs(pix1[3] - pix2[3]);
1482 s += abs(pix1[4] - pix2[4]);
1483 s += abs(pix1[5] - pix2[5]);
1484 s += abs(pix1[6] - pix2[6]);
1485 s += abs(pix1[7] - pix2[7]);
1486 pix1 += line_size;
1487 pix2 += line_size;
1488 }
1489 return s;
1490 }
1491
1492 static int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1493 {
1494 int s, i;
1495
1496 s = 0;
1497 for(i=0;i<8;i++) {
1498 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1499 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1500 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1501 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1502 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1503 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1504 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1505 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1506 pix1 += line_size;
1507 pix2 += line_size;
1508 }
1509 return s;
1510 }
1511
1512 static int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1513 {
1514 int s, i;
1515 UINT8 *pix3 = pix2 + line_size;
1516
1517 s = 0;
1518 for(i=0;i<8;i++) {
1519 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1520 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1521 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1522 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1523 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1524 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1525 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1526 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1527 pix1 += line_size;
1528 pix2 += line_size;
1529 pix3 += line_size;
1530 }
1531 return s;
1532 }
1533
1534 static int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1535 {
1536 int s, i;
1537 UINT8 *pix3 = pix2 + line_size;
1538
1539 s = 0;
1540 for(i=0;i<8;i++) {
1541 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1542 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1543 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1544 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1545 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1546 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1547 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1548 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1549 pix1 += line_size;
1550 pix2 += line_size;
1551 pix3 += line_size;
1552 }
1553 return s;
1554 }
1555
1556 void ff_block_permute(INT16 *block, UINT8 *permutation, const UINT8 *scantable, int last)
1557 {
1558 int i;
1559 INT16 temp[64];
1560
1561 if(last<=0) return;
1562 if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
1563
1564 for(i=0; i<=last; i++){
1565 const int j= scantable[i];
1566 temp[j]= block[j];
1567 block[j]=0;
1568 }
1569
1570 for(i=0; i<=last; i++){
1571 const int j= scantable[i];
1572 const int perm_j= permutation[j];
1573 block[perm_j]= temp[j];
1574 }
1575 }
1576
1577 static void clear_blocks_c(DCTELEM *blocks)
1578 {
1579 memset(blocks, 0, sizeof(DCTELEM)*6*64);
1580 }
1581
1582 void dsputil_init(DSPContext* c, unsigned mask)
1583 {
1584 int i;
1585
1586 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1587 for(i=0;i<MAX_NEG_CROP;i++) {
1588 cropTbl[i] = 0;
1589 cropTbl[i + MAX_NEG_CROP + 256] = 255;
1590 }
1591
1592 for(i=0;i<512;i++) {
1593 squareTbl[i] = (i - 256) * (i - 256);
1594 }
1595
1596 c->get_pixels = get_pixels_c;
1597 c->diff_pixels = diff_pixels_c;
1598 c->put_pixels_clamped = put_pixels_clamped_c;
1599 c->add_pixels_clamped = add_pixels_clamped_c;
1600 c->gmc1 = gmc1_c;
1601 c->gmc = gmc_c;
1602 c->clear_blocks = clear_blocks_c;
1603 c->pix_sum = pix_sum_c;
1604 c->pix_norm1 = pix_norm1_c;
1605
1606 c->pix_abs16x16 = pix_abs16x16_c;
1607 c->pix_abs16x16_x2 = pix_abs16x16_x2_c;
1608 c->pix_abs16x16_y2 = pix_abs16x16_y2_c;
1609 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1610 c->pix_abs8x8 = pix_abs8x8_c;
1611 c->pix_abs8x8_x2 = pix_abs8x8_x2_c;
1612 c->pix_abs8x8_y2 = pix_abs8x8_y2_c;
1613 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1614
1615 c->put_pixels_tab[0][0] = put_pixels16;
1616 c->put_pixels_tab[0][1] = put_pixels16_x2;
1617 c->put_pixels_tab[0][2] = put_pixels16_y2;
1618 c->put_pixels_tab[0][3] = put_pixels16_xy2;
1619
1620 c->put_no_rnd_pixels_tab[0][0] = put_pixels16;
1621 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2;
1622 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2;
1623 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2;
1624
1625 c->avg_pixels_tab[0][0] = avg_pixels16;
1626 c->avg_pixels_tab[0][1] = avg_pixels16_x2;
1627 c->avg_pixels_tab[0][2] = avg_pixels16_y2;
1628 c->avg_pixels_tab[0][3] = avg_pixels16_xy2;
1629
1630 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16;
1631 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2;
1632 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2;
1633 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2;
1634
1635 c->put_pixels_tab[1][0] = put_pixels8;
1636 c->put_pixels_tab[1][1] = put_pixels8_x2;
1637 c->put_pixels_tab[1][2] = put_pixels8_y2;
1638 c->put_pixels_tab[1][3] = put_pixels8_xy2;
1639
1640 c->put_no_rnd_pixels_tab[1][0] = put_pixels8;
1641 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2;
1642 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2;
1643 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2;
1644
1645 c->avg_pixels_tab[1][0] = avg_pixels8;
1646 c->avg_pixels_tab[1][1] = avg_pixels8_x2;
1647 c->avg_pixels_tab[1][2] = avg_pixels8_y2;
1648 c->avg_pixels_tab[1][3] = avg_pixels8_xy2;
1649
1650 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8;
1651 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2;
1652 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2;
1653 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2;
1654
1655 c->put_qpel_pixels_tab[0][ 0]= put_qpel16_mc00_c;
1656 c->put_qpel_pixels_tab[0][ 1]= put_qpel16_mc10_c;
1657 c->put_qpel_pixels_tab[0][ 2]= put_qpel16_mc20_c;
1658 c->put_qpel_pixels_tab[0][ 3]= put_qpel16_mc30_c;
1659 c->put_qpel_pixels_tab[0][ 4]= put_qpel16_mc01_c;
1660 c->put_qpel_pixels_tab[0][ 5]= put_qpel16_mc11_c;
1661 c->put_qpel_pixels_tab[0][ 6]= put_qpel16_mc21_c;
1662 c->put_qpel_pixels_tab[0][ 7]= put_qpel16_mc31_c;
1663 c->put_qpel_pixels_tab[0][ 8]= put_qpel16_mc02_c;
1664 c->put_qpel_pixels_tab[0][ 9]= put_qpel16_mc12_c;
1665 c->put_qpel_pixels_tab[0][10]= put_qpel16_mc22_c;
1666 c->put_qpel_pixels_tab[0][11]= put_qpel16_mc32_c;
1667 c->put_qpel_pixels_tab[0][12]= put_qpel16_mc03_c;
1668 c->put_qpel_pixels_tab[0][13]= put_qpel16_mc13_c;
1669 c->put_qpel_pixels_tab[0][14]= put_qpel16_mc23_c;
1670 c->put_qpel_pixels_tab[0][15]= put_qpel16_mc33_c;
1671
1672 c->put_no_rnd_qpel_pixels_tab[0][ 0]= put_no_rnd_qpel16_mc00_c;
1673 c->put_no_rnd_qpel_pixels_tab[0][ 1]= put_no_rnd_qpel16_mc10_c;
1674 c->put_no_rnd_qpel_pixels_tab[0][ 2]= put_no_rnd_qpel16_mc20_c;
1675 c->put_no_rnd_qpel_pixels_tab[0][ 3]= put_no_rnd_qpel16_mc30_c;
1676 c->put_no_rnd_qpel_pixels_tab[0][ 4]= put_no_rnd_qpel16_mc01_c;
1677 c->put_no_rnd_qpel_pixels_tab[0][ 5]= put_no_rnd_qpel16_mc11_c;
1678 c->put_no_rnd_qpel_pixels_tab[0][ 6]= put_no_rnd_qpel16_mc21_c;
1679 c->put_no_rnd_qpel_pixels_tab[0][ 7]= put_no_rnd_qpel16_mc31_c;
1680 c->put_no_rnd_qpel_pixels_tab[0][ 8]= put_no_rnd_qpel16_mc02_c;
1681 c->put_no_rnd_qpel_pixels_tab[0][ 9]= put_no_rnd_qpel16_mc12_c;
1682 c->put_no_rnd_qpel_pixels_tab[0][10]= put_no_rnd_qpel16_mc22_c;
1683 c->put_no_rnd_qpel_pixels_tab[0][11]= put_no_rnd_qpel16_mc32_c;
1684 c->put_no_rnd_qpel_pixels_tab[0][12]= put_no_rnd_qpel16_mc03_c;
1685 c->put_no_rnd_qpel_pixels_tab[0][13]= put_no_rnd_qpel16_mc13_c;
1686 c->put_no_rnd_qpel_pixels_tab[0][14]= put_no_rnd_qpel16_mc23_c;
1687 c->put_no_rnd_qpel_pixels_tab[0][15]= put_no_rnd_qpel16_mc33_c;
1688
1689 c->avg_qpel_pixels_tab[0][ 0]= avg_qpel16_mc00_c;
1690 c->avg_qpel_pixels_tab[0][ 1]= avg_qpel16_mc10_c;
1691 c->avg_qpel_pixels_tab[0][ 2]= avg_qpel16_mc20_c;
1692 c->avg_qpel_pixels_tab[0][ 3]= avg_qpel16_mc30_c;
1693 c->avg_qpel_pixels_tab[0][ 4]= avg_qpel16_mc01_c;
1694 c->avg_qpel_pixels_tab[0][ 5]= avg_qpel16_mc11_c;
1695 c->avg_qpel_pixels_tab[0][ 6]= avg_qpel16_mc21_c;
1696 c->avg_qpel_pixels_tab[0][ 7]= avg_qpel16_mc31_c;
1697 c->avg_qpel_pixels_tab[0][ 8]= avg_qpel16_mc02_c;
1698 c->avg_qpel_pixels_tab[0][ 9]= avg_qpel16_mc12_c;
1699 c->avg_qpel_pixels_tab[0][10]= avg_qpel16_mc22_c;
1700 c->avg_qpel_pixels_tab[0][11]= avg_qpel16_mc32_c;
1701 c->avg_qpel_pixels_tab[0][12]= avg_qpel16_mc03_c;
1702 c->avg_qpel_pixels_tab[0][13]= avg_qpel16_mc13_c;
1703 c->avg_qpel_pixels_tab[0][14]= avg_qpel16_mc23_c;
1704 c->avg_qpel_pixels_tab[0][15]= avg_qpel16_mc33_c;
1705 /*
1706 c->avg_no_rnd_qpel_pixels_tab[0][ 0]= avg_no_rnd_qpel16_mc00_c;
1707 c->avg_no_rnd_qpel_pixels_tab[0][ 1]= avg_no_rnd_qpel16_mc10_c;
1708 c->avg_no_rnd_qpel_pixels_tab[0][ 2]= avg_no_rnd_qpel16_mc20_c;
1709 c->avg_no_rnd_qpel_pixels_tab[0][ 3]= avg_no_rnd_qpel16_mc30_c;
1710 c->avg_no_rnd_qpel_pixels_tab[0][ 4]= avg_no_rnd_qpel16_mc01_c;
1711 c->avg_no_rnd_qpel_pixels_tab[0][ 5]= avg_no_rnd_qpel16_mc11_c;
1712 c->avg_no_rnd_qpel_pixels_tab[0][ 6]= avg_no_rnd_qpel16_mc21_c;
1713 c->avg_no_rnd_qpel_pixels_tab[0][ 7]= avg_no_rnd_qpel16_mc31_c;
1714 c->avg_no_rnd_qpel_pixels_tab[0][ 8]= avg_no_rnd_qpel16_mc02_c;
1715 c->avg_no_rnd_qpel_pixels_tab[0][ 9]= avg_no_rnd_qpel16_mc12_c;
1716 c->avg_no_rnd_qpel_pixels_tab[0][10]= avg_no_rnd_qpel16_mc22_c;
1717 c->avg_no_rnd_qpel_pixels_tab[0][11]= avg_no_rnd_qpel16_mc32_c;
1718 c->avg_no_rnd_qpel_pixels_tab[0][12]= avg_no_rnd_qpel16_mc03_c;
1719 c->avg_no_rnd_qpel_pixels_tab[0][13]= avg_no_rnd_qpel16_mc13_c;
1720 c->avg_no_rnd_qpel_pixels_tab[0][14]= avg_no_rnd_qpel16_mc23_c;
1721 c->avg_no_rnd_qpel_pixels_tab[0][15]= avg_no_rnd_qpel16_mc33_c;
1722 */
1723
1724 c->put_qpel_pixels_tab[1][ 0]= put_qpel8_mc00_c;
1725 c->put_qpel_pixels_tab[1][ 1]= put_qpel8_mc10_c;
1726 c->put_qpel_pixels_tab[1][ 2]= put_qpel8_mc20_c;
1727 c->put_qpel_pixels_tab[1][ 3]= put_qpel8_mc30_c;
1728 c->put_qpel_pixels_tab[1][ 4]= put_qpel8_mc01_c;
1729 c->put_qpel_pixels_tab[1][ 5]= put_qpel8_mc11_c;
1730 c->put_qpel_pixels_tab[1][ 6]= put_qpel8_mc21_c;
1731 c->put_qpel_pixels_tab[1][ 7]= put_qpel8_mc31_c;
1732 c->put_qpel_pixels_tab[1][ 8]= put_qpel8_mc02_c;
1733 c->put_qpel_pixels_tab[1][ 9]= put_qpel8_mc12_c;
1734 c->put_qpel_pixels_tab[1][10]= put_qpel8_mc22_c;
1735 c->put_qpel_pixels_tab[1][11]= put_qpel8_mc32_c;
1736 c->put_qpel_pixels_tab[1][12]= put_qpel8_mc03_c;
1737 c->put_qpel_pixels_tab[1][13]= put_qpel8_mc13_c;
1738 c->put_qpel_pixels_tab[1][14]= put_qpel8_mc23_c;
1739 c->put_qpel_pixels_tab[1][15]= put_qpel8_mc33_c;
1740
1741 c->put_no_rnd_qpel_pixels_tab[1][ 0]= put_no_rnd_qpel8_mc00_c;
1742 c->put_no_rnd_qpel_pixels_tab[1][ 1]= put_no_rnd_qpel8_mc10_c;
1743 c->put_no_rnd_qpel_pixels_tab[1][ 2]= put_no_rnd_qpel8_mc20_c;
1744 c->put_no_rnd_qpel_pixels_tab[1][ 3]= put_no_rnd_qpel8_mc30_c;
1745 c->put_no_rnd_qpel_pixels_tab[1][ 4]= put_no_rnd_qpel8_mc01_c;
1746 c->put_no_rnd_qpel_pixels_tab[1][ 5]= put_no_rnd_qpel8_mc11_c;
1747 c->put_no_rnd_qpel_pixels_tab[1][ 6]= put_no_rnd_qpel8_mc21_c;
1748 c->put_no_rnd_qpel_pixels_tab[1][ 7]= put_no_rnd_qpel8_mc31_c;
1749 c->put_no_rnd_qpel_pixels_tab[1][ 8]= put_no_rnd_qpel8_mc02_c;
1750 c->put_no_rnd_qpel_pixels_tab[1][ 9]= put_no_rnd_qpel8_mc12_c;
1751 c->put_no_rnd_qpel_pixels_tab[1][10]= put_no_rnd_qpel8_mc22_c;
1752 c->put_no_rnd_qpel_pixels_tab[1][11]= put_no_rnd_qpel8_mc32_c;
1753 c->put_no_rnd_qpel_pixels_tab[1][12]= put_no_rnd_qpel8_mc03_c;
1754 c->put_no_rnd_qpel_pixels_tab[1][13]= put_no_rnd_qpel8_mc13_c;
1755 c->put_no_rnd_qpel_pixels_tab[1][14]= put_no_rnd_qpel8_mc23_c;
1756 c->put_no_rnd_qpel_pixels_tab[1][15]= put_no_rnd_qpel8_mc33_c;
1757
1758 c->avg_qpel_pixels_tab[1][ 0]= avg_qpel8_mc00_c;
1759 c->avg_qpel_pixels_tab[1][ 1]= avg_qpel8_mc10_c;
1760 c->avg_qpel_pixels_tab[1][ 2]= avg_qpel8_mc20_c;
1761 c->avg_qpel_pixels_tab[1][ 3]= avg_qpel8_mc30_c;
1762 c->avg_qpel_pixels_tab[1][ 4]= avg_qpel8_mc01_c;
1763 c->avg_qpel_pixels_tab[1][ 5]= avg_qpel8_mc11_c;
1764 c->avg_qpel_pixels_tab[1][ 6]= avg_qpel8_mc21_c;
1765 c->avg_qpel_pixels_tab[1][ 7]= avg_qpel8_mc31_c;
1766 c->avg_qpel_pixels_tab[1][ 8]= avg_qpel8_mc02_c;
1767 c->avg_qpel_pixels_tab[1][ 9]= avg_qpel8_mc12_c;
1768 c->avg_qpel_pixels_tab[1][10]= avg_qpel8_mc22_c;
1769 c->avg_qpel_pixels_tab[1][11]= avg_qpel8_mc32_c;
1770 c->avg_qpel_pixels_tab[1][12]= avg_qpel8_mc03_c;
1771 c->avg_qpel_pixels_tab[1][13]= avg_qpel8_mc13_c;
1772 c->avg_qpel_pixels_tab[1][14]= avg_qpel8_mc23_c;
1773 c->avg_qpel_pixels_tab[1][15]= avg_qpel8_mc33_c;
1774 /*
1775 c->avg_no_rnd_qpel_pixels_tab[1][ 0]= avg_no_rnd_qpel8_mc00_c;
1776 c->avg_no_rnd_qpel_pixels_tab[1][ 1]= avg_no_rnd_qpel8_mc10_c;
1777 c->avg_no_rnd_qpel_pixels_tab[1][ 2]= avg_no_rnd_qpel8_mc20_c;
1778 c->avg_no_rnd_qpel_pixels_tab[1][ 3]= avg_no_rnd_qpel8_mc30_c;
1779 c->avg_no_rnd_qpel_pixels_tab[1][ 4]= avg_no_rnd_qpel8_mc01_c;
1780 c->avg_no_rnd_qpel_pixels_tab[1][ 5]= avg_no_rnd_qpel8_mc11_c;
1781 c->avg_no_rnd_qpel_pixels_tab[1][ 6]= avg_no_rnd_qpel8_mc21_c;
1782 c->avg_no_rnd_qpel_pixels_tab[1][ 7]= avg_no_rnd_qpel8_mc31_c;
1783 c->avg_no_rnd_qpel_pixels_tab[1][ 8]= avg_no_rnd_qpel8_mc02_c;
1784 c->avg_no_rnd_qpel_pixels_tab[1][ 9]= avg_no_rnd_qpel8_mc12_c;
1785 c->avg_no_rnd_qpel_pixels_tab[1][10]= avg_no_rnd_qpel8_mc22_c;
1786 c->avg_no_rnd_qpel_pixels_tab[1][11]= avg_no_rnd_qpel8_mc32_c;
1787 c->avg_no_rnd_qpel_pixels_tab[1][12]= avg_no_rnd_qpel8_mc03_c;
1788 c->avg_no_rnd_qpel_pixels_tab[1][13]= avg_no_rnd_qpel8_mc13_c;
1789 c->avg_no_rnd_qpel_pixels_tab[1][14]= avg_no_rnd_qpel8_mc23_c;
1790 c->avg_no_rnd_qpel_pixels_tab[1][15]= avg_no_rnd_qpel8_mc33_c;
1791 */
1792
1793 #ifdef HAVE_MMX
1794 dsputil_init_mmx(c, mask);
1795 if (ff_bit_exact)
1796 {
1797 /* FIXME - AVCodec context should have flag for bitexact match */
1798 /* fprintf(stderr, "\n\n\nff_bit_exact %d\n\n\n\n", ff_bit_exact); */
1799 dsputil_set_bit_exact_mmx(c, mask);
1800 }
1801 #endif
1802 #ifdef ARCH_ARMV4L
1803 dsputil_init_armv4l(c, mask);
1804 #endif
1805 #ifdef HAVE_MLIB
1806 dsputil_init_mlib(c, mask);
1807 #endif
1808 #ifdef ARCH_ALPHA
1809 dsputil_init_alpha(c, mask);
1810 #endif
1811 #ifdef ARCH_POWERPC
1812 dsputil_init_ppc(c, mask);
1813 #endif
1814 #ifdef HAVE_MMI
1815 dsputil_init_mmi(c, mask);
1816 #endif
1817
1818 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
1819 }
1820
1821 /* remove any non bit exact operation (testing purpose) */
1822 void avcodec_set_bit_exact(void)
1823 {
1824 ff_bit_exact=1;
1825 #ifdef HAVE_MMX
1826 // FIXME - better set_bit_exact
1827 // dsputil_set_bit_exact_mmx();
1828 #endif
1829 }
1830
1831 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1832 int orig_linesize[3], int coded_linesize,
1833 AVCodecContext *avctx)
1834 {
1835 int quad, diff, x, y;
1836 UINT8 *orig, *coded;
1837 UINT32 *sq = squareTbl + 256;
1838
1839 quad = 0;
1840 diff = 0;
1841
1842 /* Luminance */
1843 orig = orig_image[0];
1844 coded = coded_image[0];
1845
1846 for (y=0;y<avctx->height;y++) {
1847 for (x=0;x<avctx->width;x++) {
1848 diff = *(orig + x) - *(coded + x);
1849 quad += sq[diff];
1850 }
1851 orig += orig_linesize[0];
1852 coded += coded_linesize;
1853 }
1854
1855 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1856
1857 if (avctx->psnr_y) {
1858 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1859 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1860 } else
1861 avctx->psnr_y = 99.99;
1862 }
1863