huffyuv
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
1/*
2 * DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
de6d9b64 4 *
ff4ec49e
FB
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
de6d9b64 9 *
ff4ec49e 10 * This library is distributed in the hope that it will be useful,
de6d9b64 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
de6d9b64 14 *
ff4ec49e
FB
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
7ff037e9 18 *
59fe111e 19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
de6d9b64 20 */
de6d9b64
FB
21#include "avcodec.h"
22#include "dsputil.h"
45553457 23
5596c60c
MN
24int ff_bit_exact=0;
25
0cfa9713 26UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
de6d9b64
FB
27UINT32 squareTbl[512];
28
2ad1516a
MN
29const UINT8 ff_zigzag_direct[64] = {
30 0, 1, 8, 16, 9, 2, 3, 10,
31 17, 24, 32, 25, 18, 11, 4, 5,
e0eac44e 32 12, 19, 26, 33, 40, 48, 41, 34,
2ad1516a 33 27, 20, 13, 6, 7, 14, 21, 28,
e0eac44e
FB
34 35, 42, 49, 56, 57, 50, 43, 36,
35 29, 22, 15, 23, 30, 37, 44, 51,
36 58, 59, 52, 45, 38, 31, 39, 46,
37 53, 60, 61, 54, 47, 55, 62, 63
38};
39
2f349de2
MN
40/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
41UINT16 __align8 inv_zigzag_direct16[64];
42
2ad1516a
MN
43const UINT8 ff_alternate_horizontal_scan[64] = {
44 0, 1, 2, 3, 8, 9, 16, 17,
e0eac44e
FB
45 10, 11, 4, 5, 6, 7, 15, 14,
46 13, 12, 19, 18, 24, 25, 32, 33,
47 26, 27, 20, 21, 22, 23, 28, 29,
48 30, 31, 34, 35, 40, 41, 48, 49,
49 42, 43, 36, 37, 38, 39, 44, 45,
50 46, 47, 50, 51, 56, 57, 58, 59,
51 52, 53, 54, 55, 60, 61, 62, 63,
52};
53
2ad1516a
MN
54const UINT8 ff_alternate_vertical_scan[64] = {
55 0, 8, 16, 24, 1, 9, 2, 10,
e0eac44e
FB
56 17, 25, 32, 40, 48, 56, 57, 49,
57 41, 33, 26, 18, 3, 11, 4, 12,
58 19, 27, 34, 42, 50, 58, 35, 43,
59 51, 59, 20, 28, 5, 13, 6, 14,
60 21, 29, 36, 44, 52, 60, 37, 45,
61 53, 61, 22, 30, 7, 15, 23, 31,
62 38, 46, 54, 62, 39, 47, 55, 63,
63};
64
2f349de2 65/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
eb4b3dd3 66const UINT32 inverse[256]={
2f349de2
MN
67 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
68 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
69 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
70 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
71 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
72 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
73 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
74 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
75 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
76 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
77 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
78 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
79 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
80 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
81 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
82 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
83 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
84 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
85 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
86 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
87 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
88 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
89 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
90 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
91 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
92 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
93 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
94 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
95 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
96 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
97 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
98 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
99};
100
eb4b3dd3 101static int pix_sum_c(UINT8 * pix, int line_size)
3aa102be
MN
102{
103 int s, i, j;
104
105 s = 0;
106 for (i = 0; i < 16; i++) {
107 for (j = 0; j < 16; j += 8) {
108 s += pix[0];
109 s += pix[1];
110 s += pix[2];
111 s += pix[3];
112 s += pix[4];
113 s += pix[5];
114 s += pix[6];
115 s += pix[7];
116 pix += 8;
117 }
118 pix += line_size - 16;
119 }
120 return s;
121}
122
eb4b3dd3 123static int pix_norm1_c(UINT8 * pix, int line_size)
3aa102be
MN
124{
125 int s, i, j;
126 UINT32 *sq = squareTbl + 256;
127
128 s = 0;
129 for (i = 0; i < 16; i++) {
130 for (j = 0; j < 16; j += 8) {
131 s += sq[pix[0]];
132 s += sq[pix[1]];
133 s += sq[pix[2]];
134 s += sq[pix[3]];
135 s += sq[pix[4]];
136 s += sq[pix[5]];
137 s += sq[pix[6]];
138 s += sq[pix[7]];
139 pix += 8;
140 }
141 pix += line_size - 16;
142 }
143 return s;
144}
145
146
eb4b3dd3 147static void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
de6d9b64 148{
de6d9b64
FB
149 int i;
150
151 /* read the pixels */
de6d9b64 152 for(i=0;i<8;i++) {
c13e1abd
FH
153 block[0] = pixels[0];
154 block[1] = pixels[1];
155 block[2] = pixels[2];
156 block[3] = pixels[3];
157 block[4] = pixels[4];
158 block[5] = pixels[5];
159 block[6] = pixels[6];
160 block[7] = pixels[7];
161 pixels += line_size;
162 block += 8;
de6d9b64
FB
163 }
164}
165
eb4b3dd3
ZK
166static void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1,
167 const UINT8 *s2, int stride){
9dbcbd92
MN
168 int i;
169
170 /* read the pixels */
9dbcbd92 171 for(i=0;i<8;i++) {
c13e1abd
FH
172 block[0] = s1[0] - s2[0];
173 block[1] = s1[1] - s2[1];
174 block[2] = s1[2] - s2[2];
175 block[3] = s1[3] - s2[3];
176 block[4] = s1[4] - s2[4];
177 block[5] = s1[5] - s2[5];
178 block[6] = s1[6] - s2[6];
179 block[7] = s1[7] - s2[7];
9dbcbd92
MN
180 s1 += stride;
181 s2 += stride;
c13e1abd 182 block += 8;
9dbcbd92
MN
183 }
184}
185
186
eb4b3dd3
ZK
187static void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
188 int line_size)
de6d9b64 189{
de6d9b64
FB
190 int i;
191 UINT8 *cm = cropTbl + MAX_NEG_CROP;
192
193 /* read the pixels */
de6d9b64 194 for(i=0;i<8;i++) {
c13e1abd
FH
195 pixels[0] = cm[block[0]];
196 pixels[1] = cm[block[1]];
197 pixels[2] = cm[block[2]];
198 pixels[3] = cm[block[3]];
199 pixels[4] = cm[block[4]];
200 pixels[5] = cm[block[5]];
201 pixels[6] = cm[block[6]];
202 pixels[7] = cm[block[7]];
203
204 pixels += line_size;
205 block += 8;
de6d9b64
FB
206 }
207}
208
eb4b3dd3 209static void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
c13e1abd 210 int line_size)
de6d9b64 211{
de6d9b64
FB
212 int i;
213 UINT8 *cm = cropTbl + MAX_NEG_CROP;
214
215 /* read the pixels */
de6d9b64 216 for(i=0;i<8;i++) {
c13e1abd
FH
217 pixels[0] = cm[pixels[0] + block[0]];
218 pixels[1] = cm[pixels[1] + block[1]];
219 pixels[2] = cm[pixels[2] + block[2]];
220 pixels[3] = cm[pixels[3] + block[3]];
221 pixels[4] = cm[pixels[4] + block[4]];
222 pixels[5] = cm[pixels[5] + block[5]];
223 pixels[6] = cm[pixels[6] + block[6]];
224 pixels[7] = cm[pixels[7] + block[7]];
225 pixels += line_size;
226 block += 8;
de6d9b64
FB
227 }
228}
59fe111e
MN
229#if 0
230
231#define PIXOP2(OPNAME, OP) \
b3184779 232static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
233{\
234 int i;\
235 for(i=0; i<h; i++){\
236 OP(*((uint64_t*)block), LD64(pixels));\
237 pixels+=line_size;\
238 block +=line_size;\
239 }\
240}\
241\
45553457 242static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
243{\
244 int i;\
245 for(i=0; i<h; i++){\
246 const uint64_t a= LD64(pixels );\
247 const uint64_t b= LD64(pixels+1);\
248 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
249 pixels+=line_size;\
250 block +=line_size;\
251 }\
252}\
253\
45553457 254static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
255{\
256 int i;\
257 for(i=0; i<h; i++){\
258 const uint64_t a= LD64(pixels );\
259 const uint64_t b= LD64(pixels+1);\
260 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
261 pixels+=line_size;\
262 block +=line_size;\
263 }\
264}\
265\
45553457 266static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
267{\
268 int i;\
269 for(i=0; i<h; i++){\
270 const uint64_t a= LD64(pixels );\
271 const uint64_t b= LD64(pixels+line_size);\
272 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
273 pixels+=line_size;\
274 block +=line_size;\
275 }\
276}\
277\
45553457 278static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
279{\
280 int i;\
281 for(i=0; i<h; i++){\
282 const uint64_t a= LD64(pixels );\
283 const uint64_t b= LD64(pixels+line_size);\
284 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
285 pixels+=line_size;\
286 block +=line_size;\
287 }\
288}\
289\
45553457 290static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
291{\
292 int i;\
293 const uint64_t a= LD64(pixels );\
294 const uint64_t b= LD64(pixels+1);\
295 uint64_t l0= (a&0x0303030303030303ULL)\
296 + (b&0x0303030303030303ULL)\
297 + 0x0202020202020202ULL;\
298 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
299 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
300 uint64_t l1,h1;\
301\
302 pixels+=line_size;\
303 for(i=0; i<h; i+=2){\
304 uint64_t a= LD64(pixels );\
305 uint64_t b= LD64(pixels+1);\
306 l1= (a&0x0303030303030303ULL)\
307 + (b&0x0303030303030303ULL);\
308 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
309 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
310 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
311 pixels+=line_size;\
312 block +=line_size;\
313 a= LD64(pixels );\
314 b= LD64(pixels+1);\
315 l0= (a&0x0303030303030303ULL)\
316 + (b&0x0303030303030303ULL)\
317 + 0x0202020202020202ULL;\
318 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
319 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
320 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
321 pixels+=line_size;\
322 block +=line_size;\
323 }\
324}\
325\
45553457 326static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
327{\
328 int i;\
329 const uint64_t a= LD64(pixels );\
330 const uint64_t b= LD64(pixels+1);\
331 uint64_t l0= (a&0x0303030303030303ULL)\
332 + (b&0x0303030303030303ULL)\
333 + 0x0101010101010101ULL;\
334 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
335 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
336 uint64_t l1,h1;\
337\
338 pixels+=line_size;\
339 for(i=0; i<h; i+=2){\
340 uint64_t a= LD64(pixels );\
341 uint64_t b= LD64(pixels+1);\
342 l1= (a&0x0303030303030303ULL)\
343 + (b&0x0303030303030303ULL);\
344 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
345 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
346 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
347 pixels+=line_size;\
348 block +=line_size;\
349 a= LD64(pixels );\
350 b= LD64(pixels+1);\
351 l0= (a&0x0303030303030303ULL)\
352 + (b&0x0303030303030303ULL)\
353 + 0x0101010101010101ULL;\
354 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
355 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
356 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
357 pixels+=line_size;\
358 block +=line_size;\
359 }\
360}\
361\
45553457
ZK
362CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
363CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
364CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
365CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
366CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
367CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
368CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
59fe111e
MN
369
370#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
371#else // 64 bit variant
372
373#define PIXOP2(OPNAME, OP) \
45553457 374static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
59fe111e
MN
375 int i;\
376 for(i=0; i<h; i++){\
377 OP(*((uint32_t*)(block )), LD32(pixels ));\
378 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
379 pixels+=line_size;\
380 block +=line_size;\
381 }\
382}\
45553457
ZK
383static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
384 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
b3184779 385}\
59fe111e 386\
b3184779
MN
387static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
388 int src_stride1, int src_stride2, int h){\
59fe111e
MN
389 int i;\
390 for(i=0; i<h; i++){\
b3184779
MN
391 uint32_t a,b;\
392 a= LD32(&src1[i*src_stride1 ]);\
393 b= LD32(&src2[i*src_stride2 ]);\
394 OP(*((uint32_t*)&dst[i*dst_stride ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
395 a= LD32(&src1[i*src_stride1+4]);\
396 b= LD32(&src2[i*src_stride2+4]);\
397 OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
59fe111e
MN
398 }\
399}\
400\
b3184779
MN
401static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
402 int src_stride1, int src_stride2, int h){\
59fe111e
MN
403 int i;\
404 for(i=0; i<h; i++){\
b3184779
MN
405 uint32_t a,b;\
406 a= LD32(&src1[i*src_stride1 ]);\
407 b= LD32(&src2[i*src_stride2 ]);\
408 OP(*((uint32_t*)&dst[i*dst_stride ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
409 a= LD32(&src1[i*src_stride1+4]);\
410 b= LD32(&src2[i*src_stride2+4]);\
411 OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
59fe111e
MN
412 }\
413}\
414\
b3184779
MN
415static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
416 int src_stride1, int src_stride2, int h){\
417 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
418 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
419}\
420\
421static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
422 int src_stride1, int src_stride2, int h){\
423 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
424 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
425}\
426\
45553457 427static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
428 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
429}\
430\
45553457 431static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
432 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
433}\
434\
45553457 435static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
436 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
437}\
438\
45553457 439static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
440 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
441}\
442\
443static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
444 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
445 int i;\
446 for(i=0; i<h; i++){\
b3184779
MN
447 uint32_t a, b, c, d, l0, l1, h0, h1;\
448 a= LD32(&src1[i*src_stride1]);\
449 b= LD32(&src2[i*src_stride2]);\
450 c= LD32(&src3[i*src_stride3]);\
451 d= LD32(&src4[i*src_stride4]);\
452 l0= (a&0x03030303UL)\
453 + (b&0x03030303UL)\
454 + 0x02020202UL;\
455 h0= ((a&0xFCFCFCFCUL)>>2)\
456 + ((b&0xFCFCFCFCUL)>>2);\
457 l1= (c&0x03030303UL)\
458 + (d&0x03030303UL);\
459 h1= ((c&0xFCFCFCFCUL)>>2)\
460 + ((d&0xFCFCFCFCUL)>>2);\
461 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
462 a= LD32(&src1[i*src_stride1+4]);\
463 b= LD32(&src2[i*src_stride2+4]);\
464 c= LD32(&src3[i*src_stride3+4]);\
465 d= LD32(&src4[i*src_stride4+4]);\
466 l0= (a&0x03030303UL)\
467 + (b&0x03030303UL)\
468 + 0x02020202UL;\
469 h0= ((a&0xFCFCFCFCUL)>>2)\
470 + ((b&0xFCFCFCFCUL)>>2);\
471 l1= (c&0x03030303UL)\
472 + (d&0x03030303UL);\
473 h1= ((c&0xFCFCFCFCUL)>>2)\
474 + ((d&0xFCFCFCFCUL)>>2);\
475 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
476 }\
477}\
b3184779
MN
478static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
479 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
480 int i;\
481 for(i=0; i<h; i++){\
b3184779
MN
482 uint32_t a, b, c, d, l0, l1, h0, h1;\
483 a= LD32(&src1[i*src_stride1]);\
484 b= LD32(&src2[i*src_stride2]);\
485 c= LD32(&src3[i*src_stride3]);\
486 d= LD32(&src4[i*src_stride4]);\
487 l0= (a&0x03030303UL)\
488 + (b&0x03030303UL)\
489 + 0x01010101UL;\
490 h0= ((a&0xFCFCFCFCUL)>>2)\
491 + ((b&0xFCFCFCFCUL)>>2);\
492 l1= (c&0x03030303UL)\
493 + (d&0x03030303UL);\
494 h1= ((c&0xFCFCFCFCUL)>>2)\
495 + ((d&0xFCFCFCFCUL)>>2);\
496 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
497 a= LD32(&src1[i*src_stride1+4]);\
498 b= LD32(&src2[i*src_stride2+4]);\
499 c= LD32(&src3[i*src_stride3+4]);\
500 d= LD32(&src4[i*src_stride4+4]);\
501 l0= (a&0x03030303UL)\
502 + (b&0x03030303UL)\
503 + 0x01010101UL;\
504 h0= ((a&0xFCFCFCFCUL)>>2)\
505 + ((b&0xFCFCFCFCUL)>>2);\
506 l1= (c&0x03030303UL)\
507 + (d&0x03030303UL);\
508 h1= ((c&0xFCFCFCFCUL)>>2)\
509 + ((d&0xFCFCFCFCUL)>>2);\
510 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
511 }\
512}\
b3184779
MN
513static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
514 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
515 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
516 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
517}\
518static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
519 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
520 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
521 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
522}\
59fe111e 523\
45553457 524static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
525{\
526 int j;\
527 for(j=0; j<2; j++){\
528 int i;\
529 const uint32_t a= LD32(pixels );\
530 const uint32_t b= LD32(pixels+1);\
531 uint32_t l0= (a&0x03030303UL)\
532 + (b&0x03030303UL)\
533 + 0x02020202UL;\
534 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
535 + ((b&0xFCFCFCFCUL)>>2);\
536 uint32_t l1,h1;\
537\
538 pixels+=line_size;\
539 for(i=0; i<h; i+=2){\
540 uint32_t a= LD32(pixels );\
541 uint32_t b= LD32(pixels+1);\
542 l1= (a&0x03030303UL)\
543 + (b&0x03030303UL);\
544 h1= ((a&0xFCFCFCFCUL)>>2)\
545 + ((b&0xFCFCFCFCUL)>>2);\
546 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
547 pixels+=line_size;\
548 block +=line_size;\
549 a= LD32(pixels );\
550 b= LD32(pixels+1);\
551 l0= (a&0x03030303UL)\
552 + (b&0x03030303UL)\
553 + 0x02020202UL;\
554 h0= ((a&0xFCFCFCFCUL)>>2)\
555 + ((b&0xFCFCFCFCUL)>>2);\
556 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
557 pixels+=line_size;\
558 block +=line_size;\
559 }\
560 pixels+=4-line_size*(h+1);\
561 block +=4-line_size*h;\
562 }\
563}\
564\
45553457 565static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
566{\
567 int j;\
568 for(j=0; j<2; j++){\
569 int i;\
570 const uint32_t a= LD32(pixels );\
571 const uint32_t b= LD32(pixels+1);\
572 uint32_t l0= (a&0x03030303UL)\
573 + (b&0x03030303UL)\
574 + 0x01010101UL;\
575 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
576 + ((b&0xFCFCFCFCUL)>>2);\
577 uint32_t l1,h1;\
578\
579 pixels+=line_size;\
580 for(i=0; i<h; i+=2){\
581 uint32_t a= LD32(pixels );\
582 uint32_t b= LD32(pixels+1);\
583 l1= (a&0x03030303UL)\
584 + (b&0x03030303UL);\
585 h1= ((a&0xFCFCFCFCUL)>>2)\
586 + ((b&0xFCFCFCFCUL)>>2);\
587 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
588 pixels+=line_size;\
589 block +=line_size;\
590 a= LD32(pixels );\
591 b= LD32(pixels+1);\
592 l0= (a&0x03030303UL)\
593 + (b&0x03030303UL)\
594 + 0x01010101UL;\
595 h0= ((a&0xFCFCFCFCUL)>>2)\
596 + ((b&0xFCFCFCFCUL)>>2);\
597 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
598 pixels+=line_size;\
599 block +=line_size;\
600 }\
601 pixels+=4-line_size*(h+1);\
602 block +=4-line_size*h;\
603 }\
604}\
605\
45553457
ZK
606CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
607CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
608CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
609CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
610CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
611CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
612CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
613CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
b3184779 614
59fe111e
MN
615#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
616#endif
59fe111e
MN
617#define op_put(a, b) a = b
618
619PIXOP2(avg, op_avg)
620PIXOP2(put, op_put)
621#undef op_avg
622#undef op_put
623
de6d9b64
FB
/*
 * Rounded averages of two and four values.  Every argument is parenthesised
 * so that an expression built from operators binding less tightly than '+'
 * (for example `x | y` or `c ? p : q`) is averaged as a whole.  Arguments
 * are expanded exactly once each.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
626
073b013d 627
b3184779 628static void gmc1_c(UINT8 *dst, UINT8 *src, int stride, int h, int x16, int y16, int rounder)
44eb4951
MN
629{
630 const int A=(16-x16)*(16-y16);
631 const int B=( x16)*(16-y16);
632 const int C=(16-x16)*( y16);
633 const int D=( x16)*( y16);
634 int i;
44eb4951
MN
635
636 for(i=0; i<h; i++)
637 {
b3184779
MN
638 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
639 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
640 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
641 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
642 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
643 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
644 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
645 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
646 dst+= stride;
647 src+= stride;
44eb4951
MN
648 }
649}
650
073b013d
MN
651static void gmc_c(UINT8 *dst, UINT8 *src, int stride, int h, int ox, int oy,
652 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
653{
654 int y, vx, vy;
655 const int s= 1<<shift;
656
657 width--;
658 height--;
659
660 for(y=0; y<h; y++){
661 int x;
662
663 vx= ox;
664 vy= oy;
665 for(x=0; x<8; x++){ //XXX FIXME optimize
666 int src_x, src_y, frac_x, frac_y, index;
667
668 src_x= vx>>16;
669 src_y= vy>>16;
670 frac_x= src_x&(s-1);
671 frac_y= src_y&(s-1);
672 src_x>>=shift;
673 src_y>>=shift;
674
675 if((unsigned)src_x < width){
676 if((unsigned)src_y < height){
677 index= src_x + src_y*stride;
678 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
679 + src[index +1]* frac_x )*(s-frac_y)
680 + ( src[index+stride ]*(s-frac_x)
681 + src[index+stride+1]* frac_x )* frac_y
682 + r)>>(shift*2);
683 }else{
684 index= src_x + clip(src_y, 0, height)*stride;
685 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
686 + src[index +1]* frac_x )*s
687 + r)>>(shift*2);
688 }
689 }else{
690 if((unsigned)src_y < height){
691 index= clip(src_x, 0, width) + src_y*stride;
692 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
693 + src[index+stride ]* frac_y )*s
694 + r)>>(shift*2);
695 }else{
696 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
697 dst[y*stride + x]= src[index ];
698 }
699 }
700
701 vx+= dxx;
702 vy+= dyx;
703 }
704 ox += dxy;
705 oy += dyy;
706 }
707}
708
b3184779 709static inline void copy_block17(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
44eb4951 710{
44eb4951
MN
711 int i;
712 for(i=0; i<h; i++)
713 {
b3184779
MN
714 ST32(dst , LD32(src ));
715 ST32(dst+4 , LD32(src+4 ));
716 ST32(dst+8 , LD32(src+8 ));
717 ST32(dst+12, LD32(src+12));
718 dst[16]= src[16];
44eb4951
MN
719 dst+=dstStride;
720 src+=srcStride;
721 }
722}
723
b3184779 724static inline void copy_block9(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
44eb4951
MN
725{
726 int i;
b3184779 727 for(i=0; i<h; i++)
44eb4951 728 {
b3184779
MN
729 ST32(dst , LD32(src ));
730 ST32(dst+4 , LD32(src+4 ));
731 dst[8]= src[8];
44eb4951
MN
732 dst+=dstStride;
733 src+=srcStride;
734 }
735}
736
b3184779
MN
737#define QPEL_MC(r, OPNAME, RND, OP) \
738static void OPNAME ## mpeg4_qpel8_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
739 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
740 int i;\
741 for(i=0; i<h; i++)\
742 {\
743 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
744 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
745 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
746 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
747 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
748 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
749 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
750 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
751 dst+=dstStride;\
752 src+=srcStride;\
753 }\
44eb4951
MN
754}\
755\
b3184779
MN
756static void OPNAME ## mpeg4_qpel8_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
757 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
758 int i;\
759 for(i=0; i<w; i++)\
760 {\
761 const int src0= src[0*srcStride];\
762 const int src1= src[1*srcStride];\
763 const int src2= src[2*srcStride];\
764 const int src3= src[3*srcStride];\
765 const int src4= src[4*srcStride];\
766 const int src5= src[5*srcStride];\
767 const int src6= src[6*srcStride];\
768 const int src7= src[7*srcStride];\
769 const int src8= src[8*srcStride];\
770 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
771 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
772 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
773 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
774 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
775 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
776 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
777 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
778 dst++;\
779 src++;\
780 }\
781}\
782\
783static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
784 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
785 int i;\
786 for(i=0; i<h; i++)\
787 {\
788 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
789 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
790 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
791 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
792 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
793 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
794 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
795 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
796 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
797 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
798 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
799 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
800 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
801 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
802 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
803 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
804 dst+=dstStride;\
805 src+=srcStride;\
806 }\
807}\
808\
809static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
810 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
811 int i;\
812 for(i=0; i<w; i++)\
813 {\
814 const int src0= src[0*srcStride];\
815 const int src1= src[1*srcStride];\
816 const int src2= src[2*srcStride];\
817 const int src3= src[3*srcStride];\
818 const int src4= src[4*srcStride];\
819 const int src5= src[5*srcStride];\
820 const int src6= src[6*srcStride];\
821 const int src7= src[7*srcStride];\
822 const int src8= src[8*srcStride];\
823 const int src9= src[9*srcStride];\
824 const int src10= src[10*srcStride];\
825 const int src11= src[11*srcStride];\
826 const int src12= src[12*srcStride];\
827 const int src13= src[13*srcStride];\
828 const int src14= src[14*srcStride];\
829 const int src15= src[15*srcStride];\
830 const int src16= src[16*srcStride];\
831 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
832 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
833 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
834 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
835 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
836 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
837 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
838 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
839 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
840 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
841 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
842 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
843 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
844 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
845 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
846 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
847 dst++;\
848 src++;\
849 }\
850}\
851\
852static void OPNAME ## qpel8_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
45553457 853 OPNAME ## pixels8_c(dst, src, stride, 8);\
b3184779
MN
854}\
855\
856static void OPNAME ## qpel8_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951 857 UINT8 half[64];\
b3184779
MN
858 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
859 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
44eb4951
MN
860}\
861\
b3184779
MN
862static void OPNAME ## qpel8_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
863 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
44eb4951
MN
864}\
865\
b3184779 866static void OPNAME ## qpel8_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951 867 UINT8 half[64];\
b3184779
MN
868 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
869 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
44eb4951
MN
870}\
871\
b3184779
MN
872static void OPNAME ## qpel8_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
873 UINT8 full[16*9];\
44eb4951 874 UINT8 half[64];\
b3184779
MN
875 copy_block9(full, src, 16, stride, 9);\
876 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
877 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
44eb4951
MN
878}\
879\
b3184779
MN
880static void OPNAME ## qpel8_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
881 UINT8 full[16*9];\
882 copy_block9(full, src, 16, stride, 9);\
883 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16, 8);\
44eb4951
MN
884}\
885\
b3184779
MN
886static void OPNAME ## qpel8_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
887 UINT8 full[16*9];\
44eb4951 888 UINT8 half[64];\
b3184779
MN
889 copy_block9(full, src, 16, stride, 9);\
890 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
891 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
44eb4951 892}\
b3184779
MN
893static void OPNAME ## qpel8_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
894 UINT8 full[16*9];\
44eb4951 895 UINT8 halfH[72];\
7ff037e9 896 UINT8 halfV[64];\
44eb4951 897 UINT8 halfHV[64];\
b3184779
MN
898 copy_block9(full, src, 16, stride, 9);\
899 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
900 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
901 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
902 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 903}\
b3184779
MN
904static void OPNAME ## qpel8_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
905 UINT8 full[16*9];\
44eb4951 906 UINT8 halfH[72];\
7ff037e9 907 UINT8 halfV[64];\
44eb4951 908 UINT8 halfHV[64];\
b3184779
MN
909 copy_block9(full, src, 16, stride, 9);\
910 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
911 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
912 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
913 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 914}\
b3184779
MN
915static void OPNAME ## qpel8_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
916 UINT8 full[16*9];\
44eb4951 917 UINT8 halfH[72];\
7ff037e9 918 UINT8 halfV[64];\
44eb4951 919 UINT8 halfHV[64];\
b3184779
MN
920 copy_block9(full, src, 16, stride, 9);\
921 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
922 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
923 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
924 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 925}\
b3184779
MN
926static void OPNAME ## qpel8_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
927 UINT8 full[16*9];\
44eb4951 928 UINT8 halfH[72];\
7ff037e9 929 UINT8 halfV[64];\
44eb4951 930 UINT8 halfHV[64];\
b3184779
MN
931 copy_block9(full, src, 16, stride, 9);\
932 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
933 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
934 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
935 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 936}\
b3184779 937static void OPNAME ## qpel8_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951
MN
938 UINT8 halfH[72];\
939 UINT8 halfHV[64];\
b3184779
MN
940 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
941 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
942 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
44eb4951 943}\
b3184779 944static void OPNAME ## qpel8_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951
MN
945 UINT8 halfH[72];\
946 UINT8 halfHV[64];\
b3184779
MN
947 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
948 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
949 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
44eb4951 950}\
b3184779
MN
951static void OPNAME ## qpel8_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
952 UINT8 full[16*9];\
44eb4951 953 UINT8 halfH[72];\
7ff037e9 954 UINT8 halfV[64];\
44eb4951 955 UINT8 halfHV[64];\
b3184779
MN
956 copy_block9(full, src, 16, stride, 9);\
957 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
958 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
959 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
960 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 961}\
b3184779
MN
962static void OPNAME ## qpel8_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
963 UINT8 full[16*9];\
44eb4951 964 UINT8 halfH[72];\
7ff037e9 965 UINT8 halfV[64];\
44eb4951 966 UINT8 halfHV[64];\
b3184779
MN
967 copy_block9(full, src, 16, stride, 9);\
968 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
969 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
970 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
971 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 972}\
b3184779 973static void OPNAME ## qpel8_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951 974 UINT8 halfH[72];\
b3184779
MN
975 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
976 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8, 8);\
977}\
978static void OPNAME ## qpel16_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
45553457 979 OPNAME ## pixels16_c(dst, src, stride, 16);\
b3184779
MN
980}\
981\
982static void OPNAME ## qpel16_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
983 UINT8 half[256];\
984 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
985 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
986}\
987\
988static void OPNAME ## qpel16_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
989 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
44eb4951 990}\
b3184779
MN
991\
992static void OPNAME ## qpel16_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
993 UINT8 half[256];\
994 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
995 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
996}\
997\
998static void OPNAME ## qpel16_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
999 UINT8 full[24*17];\
1000 UINT8 half[256];\
1001 copy_block17(full, src, 24, stride, 17);\
1002 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
1003 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1004}\
1005\
1006static void OPNAME ## qpel16_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
1007 UINT8 full[24*17];\
1008 copy_block17(full, src, 24, stride, 17);\
1009 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24, 16);\
1010}\
1011\
1012static void OPNAME ## qpel16_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
1013 UINT8 full[24*17];\
1014 UINT8 half[256];\
1015 copy_block17(full, src, 24, stride, 17);\
1016 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
1017 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1018}\
1019static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
1020 UINT8 full[24*17];\
1021 UINT8 halfH[272];\
1022 UINT8 halfV[256];\
1023 UINT8 halfHV[256];\
1024 copy_block17(full, src, 24, stride, 17);\
1025 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1026 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
1027 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1028 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1029}\
1030static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
1031 UINT8 full[24*17];\
1032 UINT8 halfH[272];\
1033 UINT8 halfV[256];\
1034 UINT8 halfHV[256];\
1035 copy_block17(full, src, 24, stride, 17);\
1036 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1037 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
1038 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1039 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1040}\
1041static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
1042 UINT8 full[24*17];\
1043 UINT8 halfH[272];\
1044 UINT8 halfV[256];\
1045 UINT8 halfHV[256];\
1046 copy_block17(full, src, 24, stride, 17);\
1047 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1048 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
1049 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1050 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1051}\
1052static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
1053 UINT8 full[24*17];\
1054 UINT8 halfH[272];\
1055 UINT8 halfV[256];\
1056 UINT8 halfHV[256];\
1057 copy_block17(full, src, 24, stride, 17);\
1058 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1059 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
1060 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1061 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1062}\
1063static void OPNAME ## qpel16_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
1064 UINT8 halfH[272];\
1065 UINT8 halfHV[256];\
1066 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1067 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1068 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1069}\
1070static void OPNAME ## qpel16_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
1071 UINT8 halfH[272];\
1072 UINT8 halfHV[256];\
1073 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1074 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1075 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1076}\
1077static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
1078 UINT8 full[24*17];\
1079 UINT8 halfH[272];\
1080 UINT8 halfV[256];\
1081 UINT8 halfHV[256];\
1082 copy_block17(full, src, 24, stride, 17);\
1083 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1084 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
1085 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1086 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1087}\
1088static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
1089 UINT8 full[24*17];\
1090 UINT8 halfH[272];\
1091 UINT8 halfV[256];\
1092 UINT8 halfHV[256];\
1093 copy_block17(full, src, 24, stride, 17);\
1094 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1095 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
1096 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1097 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1098}\
1099static void OPNAME ## qpel16_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
1100 UINT8 halfH[272];\
1101 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1102 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16, 16);\
45553457 1103}
44eb4951 1104
b3184779
MN
1105#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1106#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1107#define op_put(a, b) a = cm[((b) + 16)>>5]
1108#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1109
1110QPEL_MC(0, put_ , _ , op_put)
1111QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1112QPEL_MC(0, avg_ , _ , op_avg)
1113//QPEL_MC(1, avg_no_rnd , _ , op_avg)
1114#undef op_avg
1115#undef op_avg_no_rnd
1116#undef op_put
1117#undef op_put_no_rnd
44eb4951 1118
eb4b3dd3 1119static int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1120{
1121 int s, i;
1122
1123 s = 0;
ba6802de 1124 for(i=0;i<16;i++) {
de6d9b64
FB
1125 s += abs(pix1[0] - pix2[0]);
1126 s += abs(pix1[1] - pix2[1]);
1127 s += abs(pix1[2] - pix2[2]);
1128 s += abs(pix1[3] - pix2[3]);
1129 s += abs(pix1[4] - pix2[4]);
1130 s += abs(pix1[5] - pix2[5]);
1131 s += abs(pix1[6] - pix2[6]);
1132 s += abs(pix1[7] - pix2[7]);
1133 s += abs(pix1[8] - pix2[8]);
1134 s += abs(pix1[9] - pix2[9]);
1135 s += abs(pix1[10] - pix2[10]);
1136 s += abs(pix1[11] - pix2[11]);
1137 s += abs(pix1[12] - pix2[12]);
1138 s += abs(pix1[13] - pix2[13]);
1139 s += abs(pix1[14] - pix2[14]);
1140 s += abs(pix1[15] - pix2[15]);
1141 pix1 += line_size;
1142 pix2 += line_size;
1143 }
1144 return s;
1145}
1146
eb4b3dd3 1147static int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1148{
1149 int s, i;
1150
1151 s = 0;
ba6802de 1152 for(i=0;i<16;i++) {
de6d9b64
FB
1153 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1154 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1155 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1156 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1157 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1158 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1159 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1160 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1161 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1162 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1163 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1164 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1165 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1166 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1167 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1168 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1169 pix1 += line_size;
1170 pix2 += line_size;
1171 }
1172 return s;
1173}
1174
eb4b3dd3 1175static int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1176{
1177 int s, i;
1178 UINT8 *pix3 = pix2 + line_size;
1179
1180 s = 0;
ba6802de 1181 for(i=0;i<16;i++) {
de6d9b64
FB
1182 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1183 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1184 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1185 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1186 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1187 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1188 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1189 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1190 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1191 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1192 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1193 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1194 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1195 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1196 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1197 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1198 pix1 += line_size;
1199 pix2 += line_size;
1200 pix3 += line_size;
1201 }
1202 return s;
1203}
1204
eb4b3dd3 1205static int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1206{
1207 int s, i;
1208 UINT8 *pix3 = pix2 + line_size;
1209
1210 s = 0;
ba6802de 1211 for(i=0;i<16;i++) {
de6d9b64
FB
1212 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1213 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1214 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1215 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1216 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1217 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1218 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1219 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1220 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1221 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1222 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1223 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1224 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1225 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1226 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1227 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1228 pix1 += line_size;
1229 pix2 += line_size;
1230 pix3 += line_size;
1231 }
1232 return s;
1233}
1234
eb4b3dd3 1235static int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
ba6802de
MN
1236{
1237 int s, i;
1238
1239 s = 0;
1240 for(i=0;i<8;i++) {
1241 s += abs(pix1[0] - pix2[0]);
1242 s += abs(pix1[1] - pix2[1]);
1243 s += abs(pix1[2] - pix2[2]);
1244 s += abs(pix1[3] - pix2[3]);
1245 s += abs(pix1[4] - pix2[4]);
1246 s += abs(pix1[5] - pix2[5]);
1247 s += abs(pix1[6] - pix2[6]);
1248 s += abs(pix1[7] - pix2[7]);
1249 pix1 += line_size;
1250 pix2 += line_size;
1251 }
1252 return s;
1253}
1254
eb4b3dd3 1255static int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
ba6802de
MN
1256{
1257 int s, i;
1258
1259 s = 0;
1260 for(i=0;i<8;i++) {
1261 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1262 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1263 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1264 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1265 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1266 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1267 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1268 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1269 pix1 += line_size;
1270 pix2 += line_size;
1271 }
1272 return s;
1273}
1274
eb4b3dd3 1275static int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
ba6802de
MN
1276{
1277 int s, i;
1278 UINT8 *pix3 = pix2 + line_size;
1279
1280 s = 0;
1281 for(i=0;i<8;i++) {
1282 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1283 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1284 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1285 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1286 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1287 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1288 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1289 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1290 pix1 += line_size;
1291 pix2 += line_size;
1292 pix3 += line_size;
1293 }
1294 return s;
1295}
1296
eb4b3dd3 1297static int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
ba6802de
MN
1298{
1299 int s, i;
1300 UINT8 *pix3 = pix2 + line_size;
1301
1302 s = 0;
1303 for(i=0;i<8;i++) {
1304 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1305 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1306 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1307 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1308 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1309 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1310 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1311 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1312 pix1 += line_size;
1313 pix2 += line_size;
1314 pix3 += line_size;
1315 }
1316 return s;
1317}
1318
7801d21d 1319void ff_block_permute(INT16 *block, UINT8 *permutation, const UINT8 *scantable, int last)
d962f6fd 1320{
7801d21d
MN
1321 int i;
1322 INT16 temp[64];
1323
1324 if(last<=0) return;
1325 if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
d962f6fd 1326
7801d21d
MN
1327 for(i=0; i<=last; i++){
1328 const int j= scantable[i];
1329 temp[j]= block[j];
1330 block[j]=0;
1331 }
1332
1333 for(i=0; i<=last; i++){
1334 const int j= scantable[i];
1335 const int perm_j= permutation[j];
1336 block[perm_j]= temp[j];
1337 }
d962f6fd 1338}
e0eac44e 1339
eb4b3dd3 1340static void clear_blocks_c(DCTELEM *blocks)
649c00c9
MN
1341{
1342 memset(blocks, 0, sizeof(DCTELEM)*6*64);
1343}
1344
11f18faf
MN
/**
 * Add src to dst byte by byte: dst[i] += src[i] for 0 <= i < w.
 *
 * The sums wrap modulo 256.  Nothing is done when w <= 0.
 *
 * @param dst bytes that are updated in place
 * @param src bytes that are added
 * @param w   number of bytes to process
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    /* unrolled by 8: the index has to advance by a full group, otherwise
       overlapping windows would add the same bytes more than once */
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    /* remaining (w % 8) bytes */
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
1360
/**
 * Byte-wise difference: dst[i] = src1[i] - src2[i] for 0 <= i < w.
 *
 * The differences wrap modulo 256.  Every element is read and written exactly
 * once, so dst may be the same buffer as src1 or src2.  Nothing is done when
 * w <= 0.
 *
 * @param dst  receives the differences
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes to process
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    /* unrolled by 8: the index has to advance by a full group, otherwise each
       byte is recomputed up to eight times (and in-place use goes wrong) */
    for(i=0; i+7<w; i+=8){
        dst[i+0] = src1[i+0]-src2[i+0];
        dst[i+1] = src1[i+1]-src2[i+1];
        dst[i+2] = src1[i+2]-src2[i+2];
        dst[i+3] = src1[i+3]-src2[i+3];
        dst[i+4] = src1[i+4]-src2[i+4];
        dst[i+5] = src1[i+5]-src2[i+5];
        dst[i+6] = src1[i+6]-src2[i+6];
        dst[i+7] = src1[i+7]-src2[i+7];
    }
    /* remaining (w % 8) bytes */
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
1376
eb4b3dd3 1377void dsputil_init(DSPContext* c, unsigned mask)
e0eac44e 1378{
5abd509a 1379 static int init_done = 0;
d2975f8d 1380 int i;
e0eac44e 1381
5abd509a
ZK
1382 if (!init_done) {
1383 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1384 for(i=0;i<MAX_NEG_CROP;i++) {
1385 cropTbl[i] = 0;
1386 cropTbl[i + MAX_NEG_CROP + 256] = 255;
1387 }
de6d9b64 1388
5abd509a
ZK
1389 for(i=0;i<512;i++) {
1390 squareTbl[i] = (i - 256) * (i - 256);
1391 }
92ddb692
ZK
1392
1393 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
1394
1395 init_done = 1;
de6d9b64
FB
1396 }
1397
eb4b3dd3
ZK
1398 c->get_pixels = get_pixels_c;
1399 c->diff_pixels = diff_pixels_c;
1400 c->put_pixels_clamped = put_pixels_clamped_c;
1401 c->add_pixels_clamped = add_pixels_clamped_c;
1402 c->gmc1 = gmc1_c;
1403 c->gmc = gmc_c;
1404 c->clear_blocks = clear_blocks_c;
1405 c->pix_sum = pix_sum_c;
1406 c->pix_norm1 = pix_norm1_c;
1407
45553457 1408 /* TODO [0] 16 [1] 8 */
eb4b3dd3
ZK
1409 c->pix_abs16x16 = pix_abs16x16_c;
1410 c->pix_abs16x16_x2 = pix_abs16x16_x2_c;
1411 c->pix_abs16x16_y2 = pix_abs16x16_y2_c;
1412 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1413 c->pix_abs8x8 = pix_abs8x8_c;
1414 c->pix_abs8x8_x2 = pix_abs8x8_x2_c;
1415 c->pix_abs8x8_y2 = pix_abs8x8_y2_c;
1416 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1417
45553457
ZK
1418#define dspfunc(PFX, IDX, NUM) \
1419 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
1420 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
1421 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
1422 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
1423
1424 dspfunc(put, 0, 16);
1425 dspfunc(put_no_rnd, 0, 16);
1426 dspfunc(put, 1, 8);
1427 dspfunc(put_no_rnd, 1, 8);
1428
1429 dspfunc(avg, 0, 16);
1430 dspfunc(avg_no_rnd, 0, 16);
1431 dspfunc(avg, 1, 8);
1432 dspfunc(avg_no_rnd, 1, 8);
1433#undef dspfunc
1434
1435#define dspfunc(PFX, IDX, NUM) \
1436 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
1437 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
1438 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
1439 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
1440 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
1441 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
1442 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
1443 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
1444 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
1445 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
1446 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
1447 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
1448 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
1449 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
1450 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
1451 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
1452
1453 dspfunc(put_qpel, 0, 16);
1454 dspfunc(put_no_rnd_qpel, 0, 16);
1455
1456 dspfunc(avg_qpel, 0, 16);
1457 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
1458
1459 dspfunc(put_qpel, 1, 8);
1460 dspfunc(put_no_rnd_qpel, 1, 8);
1461
1462 dspfunc(avg_qpel, 1, 8);
1463 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
1464#undef dspfunc
c9a2ebc4 1465
11f18faf
MN
1466 c->add_bytes= add_bytes_c;
1467 c->diff_bytes= diff_bytes_c;
1468
980fc7b8 1469#ifdef HAVE_MMX
eb4b3dd3 1470 dsputil_init_mmx(c, mask);
34dfe896
ZK
1471 if (ff_bit_exact)
1472 {
1473 /* FIXME - AVCodec context should have flag for bitexact match */
1474 /* fprintf(stderr, "\n\n\nff_bit_exact %d\n\n\n\n", ff_bit_exact); */
1475 dsputil_set_bit_exact_mmx(c, mask);
1476 }
de6d9b64 1477#endif
3d03c0a2 1478#ifdef ARCH_ARMV4L
eb4b3dd3 1479 dsputil_init_armv4l(c, mask);
3d03c0a2 1480#endif
c34270f5 1481#ifdef HAVE_MLIB
eb4b3dd3 1482 dsputil_init_mlib(c, mask);
c34270f5 1483#endif
1e98dffb 1484#ifdef ARCH_ALPHA
eb4b3dd3 1485 dsputil_init_alpha(c, mask);
1e98dffb 1486#endif
59925ef2 1487#ifdef ARCH_POWERPC
eb4b3dd3 1488 dsputil_init_ppc(c, mask);
a43bd1d7 1489#endif
d46aba26 1490#ifdef HAVE_MMI
eb4b3dd3 1491 dsputil_init_mmi(c, mask);
d46aba26 1492#endif
c34270f5 1493
de6d9b64 1494}
43f1708f 1495
57060b1e
FB
1496/* remove any non bit exact operation (testing purpose) */
1497void avcodec_set_bit_exact(void)
1498{
5596c60c 1499 ff_bit_exact=1;
57060b1e 1500#ifdef HAVE_MMX
34dfe896 1501// FIXME - better set_bit_exact
eb4b3dd3 1502// dsputil_set_bit_exact_mmx();
57060b1e
FB
1503#endif
1504}
1505
43f1708f
J
1506void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1507 int orig_linesize[3], int coded_linesize,
1508 AVCodecContext *avctx)
1509{
1510 int quad, diff, x, y;
1511 UINT8 *orig, *coded;
1512 UINT32 *sq = squareTbl + 256;
1513
1514 quad = 0;
1515 diff = 0;
1516
1517 /* Luminance */
1518 orig = orig_image[0];
1519 coded = coded_image[0];
1520
1521 for (y=0;y<avctx->height;y++) {
1522 for (x=0;x<avctx->width;x++) {
1523 diff = *(orig + x) - *(coded + x);
1524 quad += sq[diff];
1525 }
1526 orig += orig_linesize[0];
1527 coded += coded_linesize;
1528 }
1529
1530 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1531
1532 if (avctx->psnr_y) {
1533 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1534 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1535 } else
1536 avctx->psnr_y = 99.99;
1537}
1538