added release tar target
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
1/*
2 * DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
de6d9b64 4 *
ff4ec49e
FB
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
de6d9b64 9 *
ff4ec49e 10 * This library is distributed in the hope that it will be useful,
de6d9b64 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
de6d9b64 14 *
ff4ec49e
FB
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
7ff037e9 18 *
59fe111e 19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
de6d9b64 20 */
de6d9b64
FB
21#include "avcodec.h"
22#include "dsputil.h"
45553457 23
5596c60c
MN
24int ff_bit_exact=0;
25
0cfa9713 26UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
de6d9b64
FB
27UINT32 squareTbl[512];
28
2ad1516a
MN
29const UINT8 ff_zigzag_direct[64] = {
30 0, 1, 8, 16, 9, 2, 3, 10,
31 17, 24, 32, 25, 18, 11, 4, 5,
e0eac44e 32 12, 19, 26, 33, 40, 48, 41, 34,
2ad1516a 33 27, 20, 13, 6, 7, 14, 21, 28,
e0eac44e
FB
34 35, 42, 49, 56, 57, 50, 43, 36,
35 29, 22, 15, 23, 30, 37, 44, 51,
36 58, 59, 52, 45, 38, 31, 39, 46,
37 53, 60, 61, 54, 47, 55, 62, 63
38};
39
2f349de2
MN
40/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
41UINT16 __align8 inv_zigzag_direct16[64];
42
2ad1516a
MN
43const UINT8 ff_alternate_horizontal_scan[64] = {
44 0, 1, 2, 3, 8, 9, 16, 17,
e0eac44e
FB
45 10, 11, 4, 5, 6, 7, 15, 14,
46 13, 12, 19, 18, 24, 25, 32, 33,
47 26, 27, 20, 21, 22, 23, 28, 29,
48 30, 31, 34, 35, 40, 41, 48, 49,
49 42, 43, 36, 37, 38, 39, 44, 45,
50 46, 47, 50, 51, 56, 57, 58, 59,
51 52, 53, 54, 55, 60, 61, 62, 63,
52};
53
2ad1516a
MN
54const UINT8 ff_alternate_vertical_scan[64] = {
55 0, 8, 16, 24, 1, 9, 2, 10,
e0eac44e
FB
56 17, 25, 32, 40, 48, 56, 57, 49,
57 41, 33, 26, 18, 3, 11, 4, 12,
58 19, 27, 34, 42, 50, 58, 35, 43,
59 51, 59, 20, 28, 5, 13, 6, 14,
60 21, 29, 36, 44, 52, 60, 37, 45,
61 53, 61, 22, 30, 7, 15, 23, 31,
62 38, 46, 54, 62, 39, 47, 55, 63,
63};
64
2f349de2 65/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
eb4b3dd3 66const UINT32 inverse[256]={
2f349de2
MN
67 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
68 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
69 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
70 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
71 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
72 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
73 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
74 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
75 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
76 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
77 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
78 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
79 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
80 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
81 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
82 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
83 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
84 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
85 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
86 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
87 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
88 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
89 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
90 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
91 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
92 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
93 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
94 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
95 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
96 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
97 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
98 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
99};
100
eb4b3dd3 101static int pix_sum_c(UINT8 * pix, int line_size)
3aa102be
MN
102{
103 int s, i, j;
104
105 s = 0;
106 for (i = 0; i < 16; i++) {
107 for (j = 0; j < 16; j += 8) {
108 s += pix[0];
109 s += pix[1];
110 s += pix[2];
111 s += pix[3];
112 s += pix[4];
113 s += pix[5];
114 s += pix[6];
115 s += pix[7];
116 pix += 8;
117 }
118 pix += line_size - 16;
119 }
120 return s;
121}
122
eb4b3dd3 123static int pix_norm1_c(UINT8 * pix, int line_size)
3aa102be
MN
124{
125 int s, i, j;
126 UINT32 *sq = squareTbl + 256;
127
128 s = 0;
129 for (i = 0; i < 16; i++) {
130 for (j = 0; j < 16; j += 8) {
131 s += sq[pix[0]];
132 s += sq[pix[1]];
133 s += sq[pix[2]];
134 s += sq[pix[3]];
135 s += sq[pix[4]];
136 s += sq[pix[5]];
137 s += sq[pix[6]];
138 s += sq[pix[7]];
139 pix += 8;
140 }
141 pix += line_size - 16;
142 }
143 return s;
144}
145
146
9c76bd48
BF
147static int pix_norm_c(UINT8 * pix1, UINT8 * pix2, int line_size)
148{
149 int s, i, j;
150 UINT32 *sq = squareTbl + 256;
151
152 s = 0;
153 for (i = 0; i < 16; i++) {
154 for (j = 0; j < 16; j += 8) {
155 s += sq[pix1[0] - pix2[0]];
156 s += sq[pix1[1] - pix2[1]];
157 s += sq[pix1[2] - pix2[2]];
158 s += sq[pix1[3] - pix2[3]];
159 s += sq[pix1[4] - pix2[4]];
160 s += sq[pix1[5] - pix2[5]];
161 s += sq[pix1[6] - pix2[6]];
162 s += sq[pix1[7] - pix2[7]];
163 pix1 += 8;
164 pix2 += 8;
165 }
166 pix1 += line_size - 16;
167 pix2 += line_size - 16;
168 }
169 return s;
170}
171
eb4b3dd3 172static void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
de6d9b64 173{
de6d9b64
FB
174 int i;
175
176 /* read the pixels */
de6d9b64 177 for(i=0;i<8;i++) {
c13e1abd
FH
178 block[0] = pixels[0];
179 block[1] = pixels[1];
180 block[2] = pixels[2];
181 block[3] = pixels[3];
182 block[4] = pixels[4];
183 block[5] = pixels[5];
184 block[6] = pixels[6];
185 block[7] = pixels[7];
186 pixels += line_size;
187 block += 8;
de6d9b64
FB
188 }
189}
190
eb4b3dd3
ZK
191static void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1,
192 const UINT8 *s2, int stride){
9dbcbd92
MN
193 int i;
194
195 /* read the pixels */
9dbcbd92 196 for(i=0;i<8;i++) {
c13e1abd
FH
197 block[0] = s1[0] - s2[0];
198 block[1] = s1[1] - s2[1];
199 block[2] = s1[2] - s2[2];
200 block[3] = s1[3] - s2[3];
201 block[4] = s1[4] - s2[4];
202 block[5] = s1[5] - s2[5];
203 block[6] = s1[6] - s2[6];
204 block[7] = s1[7] - s2[7];
9dbcbd92
MN
205 s1 += stride;
206 s2 += stride;
c13e1abd 207 block += 8;
9dbcbd92
MN
208 }
209}
210
211
eb4b3dd3
ZK
212static void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
213 int line_size)
de6d9b64 214{
de6d9b64
FB
215 int i;
216 UINT8 *cm = cropTbl + MAX_NEG_CROP;
217
218 /* read the pixels */
de6d9b64 219 for(i=0;i<8;i++) {
c13e1abd
FH
220 pixels[0] = cm[block[0]];
221 pixels[1] = cm[block[1]];
222 pixels[2] = cm[block[2]];
223 pixels[3] = cm[block[3]];
224 pixels[4] = cm[block[4]];
225 pixels[5] = cm[block[5]];
226 pixels[6] = cm[block[6]];
227 pixels[7] = cm[block[7]];
228
229 pixels += line_size;
230 block += 8;
de6d9b64
FB
231 }
232}
233
eb4b3dd3 234static void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
c13e1abd 235 int line_size)
de6d9b64 236{
de6d9b64
FB
237 int i;
238 UINT8 *cm = cropTbl + MAX_NEG_CROP;
239
240 /* read the pixels */
de6d9b64 241 for(i=0;i<8;i++) {
c13e1abd
FH
242 pixels[0] = cm[pixels[0] + block[0]];
243 pixels[1] = cm[pixels[1] + block[1]];
244 pixels[2] = cm[pixels[2] + block[2]];
245 pixels[3] = cm[pixels[3] + block[3]];
246 pixels[4] = cm[pixels[4] + block[4]];
247 pixels[5] = cm[pixels[5] + block[5]];
248 pixels[6] = cm[pixels[6] + block[6]];
249 pixels[7] = cm[pixels[7] + block[7]];
250 pixels += line_size;
251 block += 8;
de6d9b64
FB
252 }
253}
59fe111e
MN
254#if 0
255
256#define PIXOP2(OPNAME, OP) \
b3184779 257static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
258{\
259 int i;\
260 for(i=0; i<h; i++){\
261 OP(*((uint64_t*)block), LD64(pixels));\
262 pixels+=line_size;\
263 block +=line_size;\
264 }\
265}\
266\
45553457 267static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
268{\
269 int i;\
270 for(i=0; i<h; i++){\
271 const uint64_t a= LD64(pixels );\
272 const uint64_t b= LD64(pixels+1);\
273 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
274 pixels+=line_size;\
275 block +=line_size;\
276 }\
277}\
278\
45553457 279static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
280{\
281 int i;\
282 for(i=0; i<h; i++){\
283 const uint64_t a= LD64(pixels );\
284 const uint64_t b= LD64(pixels+1);\
285 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
286 pixels+=line_size;\
287 block +=line_size;\
288 }\
289}\
290\
45553457 291static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
292{\
293 int i;\
294 for(i=0; i<h; i++){\
295 const uint64_t a= LD64(pixels );\
296 const uint64_t b= LD64(pixels+line_size);\
297 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
298 pixels+=line_size;\
299 block +=line_size;\
300 }\
301}\
302\
45553457 303static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
304{\
305 int i;\
306 for(i=0; i<h; i++){\
307 const uint64_t a= LD64(pixels );\
308 const uint64_t b= LD64(pixels+line_size);\
309 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
310 pixels+=line_size;\
311 block +=line_size;\
312 }\
313}\
314\
45553457 315static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
316{\
317 int i;\
318 const uint64_t a= LD64(pixels );\
319 const uint64_t b= LD64(pixels+1);\
320 uint64_t l0= (a&0x0303030303030303ULL)\
321 + (b&0x0303030303030303ULL)\
322 + 0x0202020202020202ULL;\
323 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
324 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
325 uint64_t l1,h1;\
326\
327 pixels+=line_size;\
328 for(i=0; i<h; i+=2){\
329 uint64_t a= LD64(pixels );\
330 uint64_t b= LD64(pixels+1);\
331 l1= (a&0x0303030303030303ULL)\
332 + (b&0x0303030303030303ULL);\
333 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
334 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
335 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
336 pixels+=line_size;\
337 block +=line_size;\
338 a= LD64(pixels );\
339 b= LD64(pixels+1);\
340 l0= (a&0x0303030303030303ULL)\
341 + (b&0x0303030303030303ULL)\
342 + 0x0202020202020202ULL;\
343 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
344 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
345 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
346 pixels+=line_size;\
347 block +=line_size;\
348 }\
349}\
350\
45553457 351static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
352{\
353 int i;\
354 const uint64_t a= LD64(pixels );\
355 const uint64_t b= LD64(pixels+1);\
356 uint64_t l0= (a&0x0303030303030303ULL)\
357 + (b&0x0303030303030303ULL)\
358 + 0x0101010101010101ULL;\
359 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
360 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
361 uint64_t l1,h1;\
362\
363 pixels+=line_size;\
364 for(i=0; i<h; i+=2){\
365 uint64_t a= LD64(pixels );\
366 uint64_t b= LD64(pixels+1);\
367 l1= (a&0x0303030303030303ULL)\
368 + (b&0x0303030303030303ULL);\
369 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
370 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
371 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
372 pixels+=line_size;\
373 block +=line_size;\
374 a= LD64(pixels );\
375 b= LD64(pixels+1);\
376 l0= (a&0x0303030303030303ULL)\
377 + (b&0x0303030303030303ULL)\
378 + 0x0101010101010101ULL;\
379 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
380 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
381 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
382 pixels+=line_size;\
383 block +=line_size;\
384 }\
385}\
386\
45553457
ZK
387CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
388CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
389CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
390CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
391CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
392CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
393CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
59fe111e
MN
394
395#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
396#else // 64 bit variant
397
398#define PIXOP2(OPNAME, OP) \
45553457 399static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
59fe111e
MN
400 int i;\
401 for(i=0; i<h; i++){\
402 OP(*((uint32_t*)(block )), LD32(pixels ));\
403 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
404 pixels+=line_size;\
405 block +=line_size;\
406 }\
407}\
45553457
ZK
408static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
409 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
b3184779 410}\
59fe111e 411\
b3184779
MN
412static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
413 int src_stride1, int src_stride2, int h){\
59fe111e
MN
414 int i;\
415 for(i=0; i<h; i++){\
b3184779
MN
416 uint32_t a,b;\
417 a= LD32(&src1[i*src_stride1 ]);\
418 b= LD32(&src2[i*src_stride2 ]);\
419 OP(*((uint32_t*)&dst[i*dst_stride ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
420 a= LD32(&src1[i*src_stride1+4]);\
421 b= LD32(&src2[i*src_stride2+4]);\
422 OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
59fe111e
MN
423 }\
424}\
425\
b3184779
MN
426static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
427 int src_stride1, int src_stride2, int h){\
59fe111e
MN
428 int i;\
429 for(i=0; i<h; i++){\
b3184779
MN
430 uint32_t a,b;\
431 a= LD32(&src1[i*src_stride1 ]);\
432 b= LD32(&src2[i*src_stride2 ]);\
433 OP(*((uint32_t*)&dst[i*dst_stride ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
434 a= LD32(&src1[i*src_stride1+4]);\
435 b= LD32(&src2[i*src_stride2+4]);\
436 OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
59fe111e
MN
437 }\
438}\
439\
b3184779
MN
440static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
441 int src_stride1, int src_stride2, int h){\
442 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
443 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
444}\
445\
446static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
447 int src_stride1, int src_stride2, int h){\
448 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
449 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
450}\
451\
45553457 452static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
453 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
454}\
455\
45553457 456static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
457 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
458}\
459\
45553457 460static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
461 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
462}\
463\
45553457 464static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
465 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
466}\
467\
468static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
469 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
470 int i;\
471 for(i=0; i<h; i++){\
b3184779
MN
472 uint32_t a, b, c, d, l0, l1, h0, h1;\
473 a= LD32(&src1[i*src_stride1]);\
474 b= LD32(&src2[i*src_stride2]);\
475 c= LD32(&src3[i*src_stride3]);\
476 d= LD32(&src4[i*src_stride4]);\
477 l0= (a&0x03030303UL)\
478 + (b&0x03030303UL)\
479 + 0x02020202UL;\
480 h0= ((a&0xFCFCFCFCUL)>>2)\
481 + ((b&0xFCFCFCFCUL)>>2);\
482 l1= (c&0x03030303UL)\
483 + (d&0x03030303UL);\
484 h1= ((c&0xFCFCFCFCUL)>>2)\
485 + ((d&0xFCFCFCFCUL)>>2);\
486 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
487 a= LD32(&src1[i*src_stride1+4]);\
488 b= LD32(&src2[i*src_stride2+4]);\
489 c= LD32(&src3[i*src_stride3+4]);\
490 d= LD32(&src4[i*src_stride4+4]);\
491 l0= (a&0x03030303UL)\
492 + (b&0x03030303UL)\
493 + 0x02020202UL;\
494 h0= ((a&0xFCFCFCFCUL)>>2)\
495 + ((b&0xFCFCFCFCUL)>>2);\
496 l1= (c&0x03030303UL)\
497 + (d&0x03030303UL);\
498 h1= ((c&0xFCFCFCFCUL)>>2)\
499 + ((d&0xFCFCFCFCUL)>>2);\
500 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
501 }\
502}\
b3184779
MN
503static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
504 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
505 int i;\
506 for(i=0; i<h; i++){\
b3184779
MN
507 uint32_t a, b, c, d, l0, l1, h0, h1;\
508 a= LD32(&src1[i*src_stride1]);\
509 b= LD32(&src2[i*src_stride2]);\
510 c= LD32(&src3[i*src_stride3]);\
511 d= LD32(&src4[i*src_stride4]);\
512 l0= (a&0x03030303UL)\
513 + (b&0x03030303UL)\
514 + 0x01010101UL;\
515 h0= ((a&0xFCFCFCFCUL)>>2)\
516 + ((b&0xFCFCFCFCUL)>>2);\
517 l1= (c&0x03030303UL)\
518 + (d&0x03030303UL);\
519 h1= ((c&0xFCFCFCFCUL)>>2)\
520 + ((d&0xFCFCFCFCUL)>>2);\
521 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
522 a= LD32(&src1[i*src_stride1+4]);\
523 b= LD32(&src2[i*src_stride2+4]);\
524 c= LD32(&src3[i*src_stride3+4]);\
525 d= LD32(&src4[i*src_stride4+4]);\
526 l0= (a&0x03030303UL)\
527 + (b&0x03030303UL)\
528 + 0x01010101UL;\
529 h0= ((a&0xFCFCFCFCUL)>>2)\
530 + ((b&0xFCFCFCFCUL)>>2);\
531 l1= (c&0x03030303UL)\
532 + (d&0x03030303UL);\
533 h1= ((c&0xFCFCFCFCUL)>>2)\
534 + ((d&0xFCFCFCFCUL)>>2);\
535 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
536 }\
537}\
b3184779
MN
538static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
539 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
540 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
541 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
542}\
543static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
544 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
545 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
546 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
547}\
59fe111e 548\
45553457 549static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
550{\
551 int j;\
552 for(j=0; j<2; j++){\
553 int i;\
554 const uint32_t a= LD32(pixels );\
555 const uint32_t b= LD32(pixels+1);\
556 uint32_t l0= (a&0x03030303UL)\
557 + (b&0x03030303UL)\
558 + 0x02020202UL;\
559 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
560 + ((b&0xFCFCFCFCUL)>>2);\
561 uint32_t l1,h1;\
562\
563 pixels+=line_size;\
564 for(i=0; i<h; i+=2){\
565 uint32_t a= LD32(pixels );\
566 uint32_t b= LD32(pixels+1);\
567 l1= (a&0x03030303UL)\
568 + (b&0x03030303UL);\
569 h1= ((a&0xFCFCFCFCUL)>>2)\
570 + ((b&0xFCFCFCFCUL)>>2);\
571 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
572 pixels+=line_size;\
573 block +=line_size;\
574 a= LD32(pixels );\
575 b= LD32(pixels+1);\
576 l0= (a&0x03030303UL)\
577 + (b&0x03030303UL)\
578 + 0x02020202UL;\
579 h0= ((a&0xFCFCFCFCUL)>>2)\
580 + ((b&0xFCFCFCFCUL)>>2);\
581 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
582 pixels+=line_size;\
583 block +=line_size;\
584 }\
585 pixels+=4-line_size*(h+1);\
586 block +=4-line_size*h;\
587 }\
588}\
589\
45553457 590static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
591{\
592 int j;\
593 for(j=0; j<2; j++){\
594 int i;\
595 const uint32_t a= LD32(pixels );\
596 const uint32_t b= LD32(pixels+1);\
597 uint32_t l0= (a&0x03030303UL)\
598 + (b&0x03030303UL)\
599 + 0x01010101UL;\
600 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
601 + ((b&0xFCFCFCFCUL)>>2);\
602 uint32_t l1,h1;\
603\
604 pixels+=line_size;\
605 for(i=0; i<h; i+=2){\
606 uint32_t a= LD32(pixels );\
607 uint32_t b= LD32(pixels+1);\
608 l1= (a&0x03030303UL)\
609 + (b&0x03030303UL);\
610 h1= ((a&0xFCFCFCFCUL)>>2)\
611 + ((b&0xFCFCFCFCUL)>>2);\
612 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
613 pixels+=line_size;\
614 block +=line_size;\
615 a= LD32(pixels );\
616 b= LD32(pixels+1);\
617 l0= (a&0x03030303UL)\
618 + (b&0x03030303UL)\
619 + 0x01010101UL;\
620 h0= ((a&0xFCFCFCFCUL)>>2)\
621 + ((b&0xFCFCFCFCUL)>>2);\
622 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
623 pixels+=line_size;\
624 block +=line_size;\
625 }\
626 pixels+=4-line_size*(h+1);\
627 block +=4-line_size*h;\
628 }\
629}\
630\
45553457
ZK
631CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
632CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
633CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
634CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
635CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
636CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
637CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
638CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
b3184779 639
59fe111e
MN
640#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
641#endif
59fe111e
MN
642#define op_put(a, b) a = b
643
644PIXOP2(avg, op_avg)
645PIXOP2(put, op_put)
646#undef op_avg
647#undef op_put
648
de6d9b64
FB
/* Rounded average of two / four values: (sum + half the divisor) / divisor.
 * Every argument is parenthesized so that an argument containing an operator
 * of lower precedence than '+' (for example |, &, ?: or a shift) is averaged
 * as a whole. Each argument is evaluated exactly once. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
651
073b013d 652
b3184779 653static void gmc1_c(UINT8 *dst, UINT8 *src, int stride, int h, int x16, int y16, int rounder)
44eb4951
MN
654{
655 const int A=(16-x16)*(16-y16);
656 const int B=( x16)*(16-y16);
657 const int C=(16-x16)*( y16);
658 const int D=( x16)*( y16);
659 int i;
44eb4951
MN
660
661 for(i=0; i<h; i++)
662 {
b3184779
MN
663 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
664 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
665 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
666 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
667 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
668 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
669 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
670 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
671 dst+= stride;
672 src+= stride;
44eb4951
MN
673 }
674}
675
073b013d
MN
676static void gmc_c(UINT8 *dst, UINT8 *src, int stride, int h, int ox, int oy,
677 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
678{
679 int y, vx, vy;
680 const int s= 1<<shift;
681
682 width--;
683 height--;
684
685 for(y=0; y<h; y++){
686 int x;
687
688 vx= ox;
689 vy= oy;
690 for(x=0; x<8; x++){ //XXX FIXME optimize
691 int src_x, src_y, frac_x, frac_y, index;
692
693 src_x= vx>>16;
694 src_y= vy>>16;
695 frac_x= src_x&(s-1);
696 frac_y= src_y&(s-1);
697 src_x>>=shift;
698 src_y>>=shift;
699
700 if((unsigned)src_x < width){
701 if((unsigned)src_y < height){
702 index= src_x + src_y*stride;
703 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
704 + src[index +1]* frac_x )*(s-frac_y)
705 + ( src[index+stride ]*(s-frac_x)
706 + src[index+stride+1]* frac_x )* frac_y
707 + r)>>(shift*2);
708 }else{
709 index= src_x + clip(src_y, 0, height)*stride;
710 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
711 + src[index +1]* frac_x )*s
712 + r)>>(shift*2);
713 }
714 }else{
715 if((unsigned)src_y < height){
716 index= clip(src_x, 0, width) + src_y*stride;
717 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
718 + src[index+stride ]* frac_y )*s
719 + r)>>(shift*2);
720 }else{
721 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
722 dst[y*stride + x]= src[index ];
723 }
724 }
725
726 vx+= dxx;
727 vy+= dyx;
728 }
729 ox += dxy;
730 oy += dyy;
731 }
732}
733
b3184779 734static inline void copy_block17(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
44eb4951 735{
44eb4951
MN
736 int i;
737 for(i=0; i<h; i++)
738 {
b3184779
MN
739 ST32(dst , LD32(src ));
740 ST32(dst+4 , LD32(src+4 ));
741 ST32(dst+8 , LD32(src+8 ));
742 ST32(dst+12, LD32(src+12));
743 dst[16]= src[16];
44eb4951
MN
744 dst+=dstStride;
745 src+=srcStride;
746 }
747}
748
b3184779 749static inline void copy_block9(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
44eb4951
MN
750{
751 int i;
b3184779 752 for(i=0; i<h; i++)
44eb4951 753 {
b3184779
MN
754 ST32(dst , LD32(src ));
755 ST32(dst+4 , LD32(src+4 ));
756 dst[8]= src[8];
44eb4951
MN
757 dst+=dstStride;
758 src+=srcStride;
759 }
760}
761
b3184779
MN
762#define QPEL_MC(r, OPNAME, RND, OP) \
763static void OPNAME ## mpeg4_qpel8_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
764 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
765 int i;\
766 for(i=0; i<h; i++)\
767 {\
768 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
769 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
770 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
771 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
772 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
773 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
774 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
775 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
776 dst+=dstStride;\
777 src+=srcStride;\
778 }\
44eb4951
MN
779}\
780\
b3184779
MN
781static void OPNAME ## mpeg4_qpel8_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
782 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
783 int i;\
784 for(i=0; i<w; i++)\
785 {\
786 const int src0= src[0*srcStride];\
787 const int src1= src[1*srcStride];\
788 const int src2= src[2*srcStride];\
789 const int src3= src[3*srcStride];\
790 const int src4= src[4*srcStride];\
791 const int src5= src[5*srcStride];\
792 const int src6= src[6*srcStride];\
793 const int src7= src[7*srcStride];\
794 const int src8= src[8*srcStride];\
795 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
796 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
797 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
798 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
799 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
800 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
801 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
802 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
803 dst++;\
804 src++;\
805 }\
806}\
807\
808static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
809 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
810 int i;\
811 for(i=0; i<h; i++)\
812 {\
813 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
814 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
815 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
816 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
817 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
818 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
819 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
820 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
821 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
822 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
823 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
824 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
825 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
826 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
827 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
828 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
829 dst+=dstStride;\
830 src+=srcStride;\
831 }\
832}\
833\
834static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
835 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
836 int i;\
837 for(i=0; i<w; i++)\
838 {\
839 const int src0= src[0*srcStride];\
840 const int src1= src[1*srcStride];\
841 const int src2= src[2*srcStride];\
842 const int src3= src[3*srcStride];\
843 const int src4= src[4*srcStride];\
844 const int src5= src[5*srcStride];\
845 const int src6= src[6*srcStride];\
846 const int src7= src[7*srcStride];\
847 const int src8= src[8*srcStride];\
848 const int src9= src[9*srcStride];\
849 const int src10= src[10*srcStride];\
850 const int src11= src[11*srcStride];\
851 const int src12= src[12*srcStride];\
852 const int src13= src[13*srcStride];\
853 const int src14= src[14*srcStride];\
854 const int src15= src[15*srcStride];\
855 const int src16= src[16*srcStride];\
856 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
857 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
858 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
859 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
860 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
861 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
862 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
863 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
864 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
865 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
866 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
867 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
868 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
869 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
870 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
871 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
872 dst++;\
873 src++;\
874 }\
875}\
876\
877static void OPNAME ## qpel8_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
45553457 878 OPNAME ## pixels8_c(dst, src, stride, 8);\
b3184779
MN
879}\
880\
881static void OPNAME ## qpel8_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951 882 UINT8 half[64];\
b3184779
MN
883 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
884 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
44eb4951
MN
885}\
886\
b3184779
MN
887static void OPNAME ## qpel8_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
888 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
44eb4951
MN
889}\
890\
b3184779 891static void OPNAME ## qpel8_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951 892 UINT8 half[64];\
b3184779
MN
893 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
894 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
44eb4951
MN
895}\
896\
b3184779
MN
897static void OPNAME ## qpel8_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
898 UINT8 full[16*9];\
44eb4951 899 UINT8 half[64];\
b3184779
MN
900 copy_block9(full, src, 16, stride, 9);\
901 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
902 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
44eb4951
MN
903}\
904\
b3184779
MN
905static void OPNAME ## qpel8_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
906 UINT8 full[16*9];\
907 copy_block9(full, src, 16, stride, 9);\
908 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16, 8);\
44eb4951
MN
909}\
910\
b3184779
MN
911static void OPNAME ## qpel8_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
912 UINT8 full[16*9];\
44eb4951 913 UINT8 half[64];\
b3184779
MN
914 copy_block9(full, src, 16, stride, 9);\
915 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
916 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
44eb4951 917}\
b3184779
MN
918static void OPNAME ## qpel8_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
919 UINT8 full[16*9];\
44eb4951 920 UINT8 halfH[72];\
7ff037e9 921 UINT8 halfV[64];\
44eb4951 922 UINT8 halfHV[64];\
b3184779
MN
923 copy_block9(full, src, 16, stride, 9);\
924 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
925 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
926 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
927 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 928}\
b3184779
MN
929static void OPNAME ## qpel8_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
930 UINT8 full[16*9];\
44eb4951 931 UINT8 halfH[72];\
7ff037e9 932 UINT8 halfV[64];\
44eb4951 933 UINT8 halfHV[64];\
b3184779
MN
934 copy_block9(full, src, 16, stride, 9);\
935 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
936 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
937 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
938 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 939}\
b3184779
MN
940static void OPNAME ## qpel8_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
941 UINT8 full[16*9];\
44eb4951 942 UINT8 halfH[72];\
7ff037e9 943 UINT8 halfV[64];\
44eb4951 944 UINT8 halfHV[64];\
b3184779
MN
945 copy_block9(full, src, 16, stride, 9);\
946 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
947 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
948 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
949 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 950}\
b3184779
MN
951static void OPNAME ## qpel8_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
952 UINT8 full[16*9];\
44eb4951 953 UINT8 halfH[72];\
7ff037e9 954 UINT8 halfV[64];\
44eb4951 955 UINT8 halfHV[64];\
b3184779
MN
956 copy_block9(full, src, 16, stride, 9);\
957 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
958 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
959 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
960 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 961}\
b3184779 962static void OPNAME ## qpel8_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951
MN
963 UINT8 halfH[72];\
964 UINT8 halfHV[64];\
b3184779
MN
965 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
966 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
967 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
44eb4951 968}\
b3184779 969static void OPNAME ## qpel8_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951
MN
970 UINT8 halfH[72];\
971 UINT8 halfHV[64];\
b3184779
MN
972 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
973 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
974 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
44eb4951 975}\
b3184779
MN
976static void OPNAME ## qpel8_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
977 UINT8 full[16*9];\
44eb4951 978 UINT8 halfH[72];\
7ff037e9 979 UINT8 halfV[64];\
44eb4951 980 UINT8 halfHV[64];\
b3184779
MN
981 copy_block9(full, src, 16, stride, 9);\
982 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
983 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
984 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
985 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 986}\
b3184779
MN
987static void OPNAME ## qpel8_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
988 UINT8 full[16*9];\
44eb4951 989 UINT8 halfH[72];\
7ff037e9 990 UINT8 halfV[64];\
44eb4951 991 UINT8 halfHV[64];\
b3184779
MN
992 copy_block9(full, src, 16, stride, 9);\
993 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
994 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
995 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
996 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 997}\
b3184779 998static void OPNAME ## qpel8_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951 999 UINT8 halfH[72];\
b3184779
MN
1000 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1001 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8, 8);\
1002}\
1003static void OPNAME ## qpel16_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
45553457 1004 OPNAME ## pixels16_c(dst, src, stride, 16);\
b3184779
MN
1005}\
1006\
1007static void OPNAME ## qpel16_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
1008 UINT8 half[256];\
1009 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1010 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1011}\
1012\
1013static void OPNAME ## qpel16_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
1014 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
44eb4951 1015}\
b3184779
MN
1016\
1017static void OPNAME ## qpel16_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
1018 UINT8 half[256];\
1019 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1020 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1021}\
1022\
1023static void OPNAME ## qpel16_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
1024 UINT8 full[24*17];\
1025 UINT8 half[256];\
1026 copy_block17(full, src, 24, stride, 17);\
1027 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
1028 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1029}\
1030\
1031static void OPNAME ## qpel16_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
1032 UINT8 full[24*17];\
1033 copy_block17(full, src, 24, stride, 17);\
1034 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24, 16);\
1035}\
1036\
1037static void OPNAME ## qpel16_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
1038 UINT8 full[24*17];\
1039 UINT8 half[256];\
1040 copy_block17(full, src, 24, stride, 17);\
1041 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
1042 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1043}\
1044static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
1045 UINT8 full[24*17];\
1046 UINT8 halfH[272];\
1047 UINT8 halfV[256];\
1048 UINT8 halfHV[256];\
1049 copy_block17(full, src, 24, stride, 17);\
1050 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1051 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
1052 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1053 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1054}\
1055static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
1056 UINT8 full[24*17];\
1057 UINT8 halfH[272];\
1058 UINT8 halfV[256];\
1059 UINT8 halfHV[256];\
1060 copy_block17(full, src, 24, stride, 17);\
1061 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1062 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
1063 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1064 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1065}\
1066static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
1067 UINT8 full[24*17];\
1068 UINT8 halfH[272];\
1069 UINT8 halfV[256];\
1070 UINT8 halfHV[256];\
1071 copy_block17(full, src, 24, stride, 17);\
1072 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1073 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
1074 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1075 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1076}\
1077static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
1078 UINT8 full[24*17];\
1079 UINT8 halfH[272];\
1080 UINT8 halfV[256];\
1081 UINT8 halfHV[256];\
1082 copy_block17(full, src, 24, stride, 17);\
1083 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1084 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
1085 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1086 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1087}\
1088static void OPNAME ## qpel16_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
1089 UINT8 halfH[272];\
1090 UINT8 halfHV[256];\
1091 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1092 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1093 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1094}\
1095static void OPNAME ## qpel16_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
1096 UINT8 halfH[272];\
1097 UINT8 halfHV[256];\
1098 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1099 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1100 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1101}\
1102static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
1103 UINT8 full[24*17];\
1104 UINT8 halfH[272];\
1105 UINT8 halfV[256];\
1106 UINT8 halfHV[256];\
1107 copy_block17(full, src, 24, stride, 17);\
1108 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1109 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
1110 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1111 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1112}\
1113static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
1114 UINT8 full[24*17];\
1115 UINT8 halfH[272];\
1116 UINT8 halfV[256];\
1117 UINT8 halfHV[256];\
1118 copy_block17(full, src, 24, stride, 17);\
1119 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1120 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
1121 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1122 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1123}\
1124static void OPNAME ## qpel16_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
1125 UINT8 halfH[272];\
1126 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1127 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16, 16);\
45553457 1128}
44eb4951 1129
b3184779
MN
1130#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1131#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1132#define op_put(a, b) a = cm[((b) + 16)>>5]
1133#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1134
1135QPEL_MC(0, put_ , _ , op_put)
1136QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1137QPEL_MC(0, avg_ , _ , op_avg)
1138//QPEL_MC(1, avg_no_rnd , _ , op_avg)
1139#undef op_avg
1140#undef op_avg_no_rnd
1141#undef op_put
1142#undef op_put_no_rnd
44eb4951 1143
eb4b3dd3 1144static int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1145{
1146 int s, i;
1147
1148 s = 0;
ba6802de 1149 for(i=0;i<16;i++) {
de6d9b64
FB
1150 s += abs(pix1[0] - pix2[0]);
1151 s += abs(pix1[1] - pix2[1]);
1152 s += abs(pix1[2] - pix2[2]);
1153 s += abs(pix1[3] - pix2[3]);
1154 s += abs(pix1[4] - pix2[4]);
1155 s += abs(pix1[5] - pix2[5]);
1156 s += abs(pix1[6] - pix2[6]);
1157 s += abs(pix1[7] - pix2[7]);
1158 s += abs(pix1[8] - pix2[8]);
1159 s += abs(pix1[9] - pix2[9]);
1160 s += abs(pix1[10] - pix2[10]);
1161 s += abs(pix1[11] - pix2[11]);
1162 s += abs(pix1[12] - pix2[12]);
1163 s += abs(pix1[13] - pix2[13]);
1164 s += abs(pix1[14] - pix2[14]);
1165 s += abs(pix1[15] - pix2[15]);
1166 pix1 += line_size;
1167 pix2 += line_size;
1168 }
1169 return s;
1170}
1171
eb4b3dd3 1172static int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1173{
1174 int s, i;
1175
1176 s = 0;
ba6802de 1177 for(i=0;i<16;i++) {
de6d9b64
FB
1178 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1179 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1180 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1181 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1182 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1183 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1184 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1185 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1186 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1187 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1188 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1189 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1190 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1191 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1192 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1193 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1194 pix1 += line_size;
1195 pix2 += line_size;
1196 }
1197 return s;
1198}
1199
eb4b3dd3 1200static int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1201{
1202 int s, i;
1203 UINT8 *pix3 = pix2 + line_size;
1204
1205 s = 0;
ba6802de 1206 for(i=0;i<16;i++) {
de6d9b64
FB
1207 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1208 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1209 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1210 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1211 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1212 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1213 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1214 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1215 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1216 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1217 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1218 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1219 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1220 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1221 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1222 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1223 pix1 += line_size;
1224 pix2 += line_size;
1225 pix3 += line_size;
1226 }
1227 return s;
1228}
1229
eb4b3dd3 1230static int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1231{
1232 int s, i;
1233 UINT8 *pix3 = pix2 + line_size;
1234
1235 s = 0;
ba6802de 1236 for(i=0;i<16;i++) {
de6d9b64
FB
1237 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1238 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1239 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1240 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1241 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1242 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1243 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1244 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1245 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1246 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1247 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1248 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1249 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1250 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1251 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1252 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1253 pix1 += line_size;
1254 pix2 += line_size;
1255 pix3 += line_size;
1256 }
1257 return s;
1258}
1259
eb4b3dd3 1260static int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
ba6802de
MN
1261{
1262 int s, i;
1263
1264 s = 0;
1265 for(i=0;i<8;i++) {
1266 s += abs(pix1[0] - pix2[0]);
1267 s += abs(pix1[1] - pix2[1]);
1268 s += abs(pix1[2] - pix2[2]);
1269 s += abs(pix1[3] - pix2[3]);
1270 s += abs(pix1[4] - pix2[4]);
1271 s += abs(pix1[5] - pix2[5]);
1272 s += abs(pix1[6] - pix2[6]);
1273 s += abs(pix1[7] - pix2[7]);
1274 pix1 += line_size;
1275 pix2 += line_size;
1276 }
1277 return s;
1278}
1279
eb4b3dd3 1280static int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
ba6802de
MN
1281{
1282 int s, i;
1283
1284 s = 0;
1285 for(i=0;i<8;i++) {
1286 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1287 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1288 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1289 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1290 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1291 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1292 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1293 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1294 pix1 += line_size;
1295 pix2 += line_size;
1296 }
1297 return s;
1298}
1299
eb4b3dd3 1300static int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
ba6802de
MN
1301{
1302 int s, i;
1303 UINT8 *pix3 = pix2 + line_size;
1304
1305 s = 0;
1306 for(i=0;i<8;i++) {
1307 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1308 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1309 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1310 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1311 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1312 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1313 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1314 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1315 pix1 += line_size;
1316 pix2 += line_size;
1317 pix3 += line_size;
1318 }
1319 return s;
1320}
1321
eb4b3dd3 1322static int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
ba6802de
MN
1323{
1324 int s, i;
1325 UINT8 *pix3 = pix2 + line_size;
1326
1327 s = 0;
1328 for(i=0;i<8;i++) {
1329 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1330 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1331 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1332 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1333 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1334 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1335 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1336 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1337 pix1 += line_size;
1338 pix2 += line_size;
1339 pix3 += line_size;
1340 }
1341 return s;
1342}
1343
7801d21d 1344void ff_block_permute(INT16 *block, UINT8 *permutation, const UINT8 *scantable, int last)
d962f6fd 1345{
7801d21d
MN
1346 int i;
1347 INT16 temp[64];
1348
1349 if(last<=0) return;
9a7b310d 1350 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
d962f6fd 1351
7801d21d
MN
1352 for(i=0; i<=last; i++){
1353 const int j= scantable[i];
1354 temp[j]= block[j];
1355 block[j]=0;
1356 }
1357
1358 for(i=0; i<=last; i++){
1359 const int j= scantable[i];
1360 const int perm_j= permutation[j];
1361 block[perm_j]= temp[j];
1362 }
d962f6fd 1363}
e0eac44e 1364
eb4b3dd3 1365static void clear_blocks_c(DCTELEM *blocks)
649c00c9
MN
1366{
1367 memset(blocks, 0, sizeof(DCTELEM)*6*64);
1368}
1369
11f18faf
MN
/*
 * Add src to dst byte by byte: dst[i] += src[i] for every i in [0, w).
 * Sums wrap around modulo 256 (uint8_t arithmetic).
 *
 * dst: buffer updated in place, at least w bytes
 * src: bytes to add, at least w bytes
 * w:   number of bytes to process; nothing happens when w <= 0
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;

    /* The unrolled body consumes 8 bytes, so the index has to advance by 8.
       Advancing by 1 would slide an 8-byte window over the buffer and add
       src to most bytes several times. */
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    /* remaining 0..7 bytes */
    for(; i<w; i++)
        dst[i] += src[i];
}
1385
/*
 * Byte-wise difference: dst[i] = src1[i] - src2[i] for every i in [0, w).
 * Differences wrap around modulo 256 (uint8_t arithmetic).
 *
 * dst:  output buffer, at least w bytes
 * src1: minuend bytes, at least w bytes
 * src2: subtrahend bytes, at least w bytes
 * w:    number of bytes to process; nothing happens when w <= 0
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;

    /* The unrolled body produces 8 bytes, so the index has to advance by 8.
       Advancing by 1 recomputes most bytes up to 8 times and gives wrong
       results when dst is the same buffer as src1 or src2. */
    for(i=0; i+7<w; i+=8){
        dst[i+0] = src1[i+0]-src2[i+0];
        dst[i+1] = src1[i+1]-src2[i+1];
        dst[i+2] = src1[i+2]-src2[i+2];
        dst[i+3] = src1[i+3]-src2[i+3];
        dst[i+4] = src1[i+4]-src2[i+4];
        dst[i+5] = src1[i+5]-src2[i+5];
        dst[i+6] = src1[i+6]-src2[i+6];
        dst[i+7] = src1[i+7]-src2[i+7];
    }
    /* remaining 0..7 bytes */
    for(; i<w; i++)
        dst[i] = src1[i]-src2[i];
}
1401
eb4b3dd3 1402void dsputil_init(DSPContext* c, unsigned mask)
e0eac44e 1403{
5abd509a 1404 static int init_done = 0;
d2975f8d 1405 int i;
e0eac44e 1406
5abd509a
ZK
1407 if (!init_done) {
1408 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1409 for(i=0;i<MAX_NEG_CROP;i++) {
1410 cropTbl[i] = 0;
1411 cropTbl[i + MAX_NEG_CROP + 256] = 255;
1412 }
de6d9b64 1413
5abd509a
ZK
1414 for(i=0;i<512;i++) {
1415 squareTbl[i] = (i - 256) * (i - 256);
1416 }
92ddb692
ZK
1417
1418 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
1419
1420 init_done = 1;
de6d9b64
FB
1421 }
1422
eb4b3dd3
ZK
1423 c->get_pixels = get_pixels_c;
1424 c->diff_pixels = diff_pixels_c;
1425 c->put_pixels_clamped = put_pixels_clamped_c;
1426 c->add_pixels_clamped = add_pixels_clamped_c;
1427 c->gmc1 = gmc1_c;
1428 c->gmc = gmc_c;
1429 c->clear_blocks = clear_blocks_c;
1430 c->pix_sum = pix_sum_c;
1431 c->pix_norm1 = pix_norm1_c;
9c76bd48 1432 c->pix_norm = pix_norm_c;
eb4b3dd3 1433
45553457 1434 /* TODO [0] 16 [1] 8 */
eb4b3dd3
ZK
1435 c->pix_abs16x16 = pix_abs16x16_c;
1436 c->pix_abs16x16_x2 = pix_abs16x16_x2_c;
1437 c->pix_abs16x16_y2 = pix_abs16x16_y2_c;
1438 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1439 c->pix_abs8x8 = pix_abs8x8_c;
1440 c->pix_abs8x8_x2 = pix_abs8x8_x2_c;
1441 c->pix_abs8x8_y2 = pix_abs8x8_y2_c;
1442 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1443
45553457
ZK
1444#define dspfunc(PFX, IDX, NUM) \
1445 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
1446 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
1447 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
1448 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
1449
1450 dspfunc(put, 0, 16);
1451 dspfunc(put_no_rnd, 0, 16);
1452 dspfunc(put, 1, 8);
1453 dspfunc(put_no_rnd, 1, 8);
1454
1455 dspfunc(avg, 0, 16);
1456 dspfunc(avg_no_rnd, 0, 16);
1457 dspfunc(avg, 1, 8);
1458 dspfunc(avg_no_rnd, 1, 8);
1459#undef dspfunc
1460
1461#define dspfunc(PFX, IDX, NUM) \
1462 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
1463 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
1464 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
1465 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
1466 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
1467 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
1468 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
1469 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
1470 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
1471 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
1472 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
1473 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
1474 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
1475 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
1476 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
1477 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
1478
1479 dspfunc(put_qpel, 0, 16);
1480 dspfunc(put_no_rnd_qpel, 0, 16);
1481
1482 dspfunc(avg_qpel, 0, 16);
1483 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
1484
1485 dspfunc(put_qpel, 1, 8);
1486 dspfunc(put_no_rnd_qpel, 1, 8);
1487
1488 dspfunc(avg_qpel, 1, 8);
1489 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
1490#undef dspfunc
c9a2ebc4 1491
11f18faf
MN
1492 c->add_bytes= add_bytes_c;
1493 c->diff_bytes= diff_bytes_c;
1494
980fc7b8 1495#ifdef HAVE_MMX
eb4b3dd3 1496 dsputil_init_mmx(c, mask);
34dfe896
ZK
1497 if (ff_bit_exact)
1498 {
1499 /* FIXME - AVCodec context should have flag for bitexact match */
1500 /* fprintf(stderr, "\n\n\nff_bit_exact %d\n\n\n\n", ff_bit_exact); */
1501 dsputil_set_bit_exact_mmx(c, mask);
1502 }
de6d9b64 1503#endif
3d03c0a2 1504#ifdef ARCH_ARMV4L
eb4b3dd3 1505 dsputil_init_armv4l(c, mask);
3d03c0a2 1506#endif
c34270f5 1507#ifdef HAVE_MLIB
eb4b3dd3 1508 dsputil_init_mlib(c, mask);
c34270f5 1509#endif
1e98dffb 1510#ifdef ARCH_ALPHA
eb4b3dd3 1511 dsputil_init_alpha(c, mask);
1e98dffb 1512#endif
59925ef2 1513#ifdef ARCH_POWERPC
eb4b3dd3 1514 dsputil_init_ppc(c, mask);
a43bd1d7 1515#endif
d46aba26 1516#ifdef HAVE_MMI
eb4b3dd3 1517 dsputil_init_mmi(c, mask);
d46aba26 1518#endif
c34270f5 1519
de6d9b64 1520}
43f1708f 1521
57060b1e
FB
1522/* remove any non bit exact operation (testing purpose) */
1523void avcodec_set_bit_exact(void)
1524{
5596c60c 1525 ff_bit_exact=1;
57060b1e 1526#ifdef HAVE_MMX
34dfe896 1527// FIXME - better set_bit_exact
eb4b3dd3 1528// dsputil_set_bit_exact_mmx();
57060b1e
FB
1529#endif
1530}