mpeg4 header encoding bugfix
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
1/*
2 * DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
de6d9b64 4 *
ff4ec49e
FB
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
de6d9b64 9 *
ff4ec49e 10 * This library is distributed in the hope that it will be useful,
de6d9b64 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
de6d9b64 14 *
ff4ec49e
FB
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
7ff037e9 18 *
59fe111e 19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
de6d9b64 20 */
de6d9b64
FB
21#include "avcodec.h"
22#include "dsputil.h"
1457ab52 23#include "mpegvideo.h"
45553457 24
5596c60c
MN
25int ff_bit_exact=0;
26
0cfa9713 27UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
de6d9b64
FB
28UINT32 squareTbl[512];
29
2ad1516a
MN
30const UINT8 ff_zigzag_direct[64] = {
31 0, 1, 8, 16, 9, 2, 3, 10,
32 17, 24, 32, 25, 18, 11, 4, 5,
e0eac44e 33 12, 19, 26, 33, 40, 48, 41, 34,
2ad1516a 34 27, 20, 13, 6, 7, 14, 21, 28,
e0eac44e
FB
35 35, 42, 49, 56, 57, 50, 43, 36,
36 29, 22, 15, 23, 30, 37, 44, 51,
37 58, 59, 52, 45, 38, 31, 39, 46,
38 53, 60, 61, 54, 47, 55, 62, 63
39};
40
2f349de2
MN
41/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
42UINT16 __align8 inv_zigzag_direct16[64];
43
2ad1516a
MN
44const UINT8 ff_alternate_horizontal_scan[64] = {
45 0, 1, 2, 3, 8, 9, 16, 17,
e0eac44e
FB
46 10, 11, 4, 5, 6, 7, 15, 14,
47 13, 12, 19, 18, 24, 25, 32, 33,
48 26, 27, 20, 21, 22, 23, 28, 29,
49 30, 31, 34, 35, 40, 41, 48, 49,
50 42, 43, 36, 37, 38, 39, 44, 45,
51 46, 47, 50, 51, 56, 57, 58, 59,
52 52, 53, 54, 55, 60, 61, 62, 63,
53};
54
2ad1516a
MN
55const UINT8 ff_alternate_vertical_scan[64] = {
56 0, 8, 16, 24, 1, 9, 2, 10,
e0eac44e
FB
57 17, 25, 32, 40, 48, 56, 57, 49,
58 41, 33, 26, 18, 3, 11, 4, 12,
59 19, 27, 34, 42, 50, 58, 35, 43,
60 51, 59, 20, 28, 5, 13, 6, 14,
61 21, 29, 36, 44, 52, 60, 37, 45,
62 53, 61, 22, 30, 7, 15, 23, 31,
63 38, 46, 54, 62, 39, 47, 55, 63,
64};
65
2f349de2 66/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
eb4b3dd3 67const UINT32 inverse[256]={
2f349de2
MN
68 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
69 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
70 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
71 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
72 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
73 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
74 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
75 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
76 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
77 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
78 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
79 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
80 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
81 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
82 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
83 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
84 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
85 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
86 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
87 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
88 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
89 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
90 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
91 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
92 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
93 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
94 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
95 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
96 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
97 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
98 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
99 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
100};
101
eb4b3dd3 102static int pix_sum_c(UINT8 * pix, int line_size)
3aa102be
MN
103{
104 int s, i, j;
105
106 s = 0;
107 for (i = 0; i < 16; i++) {
108 for (j = 0; j < 16; j += 8) {
109 s += pix[0];
110 s += pix[1];
111 s += pix[2];
112 s += pix[3];
113 s += pix[4];
114 s += pix[5];
115 s += pix[6];
116 s += pix[7];
117 pix += 8;
118 }
119 pix += line_size - 16;
120 }
121 return s;
122}
123
eb4b3dd3 124static int pix_norm1_c(UINT8 * pix, int line_size)
3aa102be
MN
125{
126 int s, i, j;
127 UINT32 *sq = squareTbl + 256;
128
129 s = 0;
130 for (i = 0; i < 16; i++) {
131 for (j = 0; j < 16; j += 8) {
132 s += sq[pix[0]];
133 s += sq[pix[1]];
134 s += sq[pix[2]];
135 s += sq[pix[3]];
136 s += sq[pix[4]];
137 s += sq[pix[5]];
138 s += sq[pix[6]];
139 s += sq[pix[7]];
140 pix += 8;
141 }
142 pix += line_size - 16;
143 }
144 return s;
145}
146
147
1457ab52
MN
148static int sse8_c(void *v, UINT8 * pix1, UINT8 * pix2, int line_size)
149{
150 int s, i;
151 UINT32 *sq = squareTbl + 256;
152
153 s = 0;
154 for (i = 0; i < 8; i++) {
155 s += sq[pix1[0] - pix2[0]];
156 s += sq[pix1[1] - pix2[1]];
157 s += sq[pix1[2] - pix2[2]];
158 s += sq[pix1[3] - pix2[3]];
159 s += sq[pix1[4] - pix2[4]];
160 s += sq[pix1[5] - pix2[5]];
161 s += sq[pix1[6] - pix2[6]];
162 s += sq[pix1[7] - pix2[7]];
163 pix1 += line_size;
164 pix2 += line_size;
165 }
166 return s;
167}
168
169static int sse16_c(void *v, UINT8 * pix1, UINT8 * pix2, int line_size)
9c76bd48
BF
170{
171 int s, i, j;
172 UINT32 *sq = squareTbl + 256;
173
174 s = 0;
175 for (i = 0; i < 16; i++) {
176 for (j = 0; j < 16; j += 8) {
177 s += sq[pix1[0] - pix2[0]];
178 s += sq[pix1[1] - pix2[1]];
179 s += sq[pix1[2] - pix2[2]];
180 s += sq[pix1[3] - pix2[3]];
181 s += sq[pix1[4] - pix2[4]];
182 s += sq[pix1[5] - pix2[5]];
183 s += sq[pix1[6] - pix2[6]];
184 s += sq[pix1[7] - pix2[7]];
185 pix1 += 8;
186 pix2 += 8;
187 }
188 pix1 += line_size - 16;
189 pix2 += line_size - 16;
190 }
191 return s;
192}
193
eb4b3dd3 194static void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
de6d9b64 195{
de6d9b64
FB
196 int i;
197
198 /* read the pixels */
de6d9b64 199 for(i=0;i<8;i++) {
c13e1abd
FH
200 block[0] = pixels[0];
201 block[1] = pixels[1];
202 block[2] = pixels[2];
203 block[3] = pixels[3];
204 block[4] = pixels[4];
205 block[5] = pixels[5];
206 block[6] = pixels[6];
207 block[7] = pixels[7];
208 pixels += line_size;
209 block += 8;
de6d9b64
FB
210 }
211}
212
eb4b3dd3
ZK
213static void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1,
214 const UINT8 *s2, int stride){
9dbcbd92
MN
215 int i;
216
217 /* read the pixels */
9dbcbd92 218 for(i=0;i<8;i++) {
c13e1abd
FH
219 block[0] = s1[0] - s2[0];
220 block[1] = s1[1] - s2[1];
221 block[2] = s1[2] - s2[2];
222 block[3] = s1[3] - s2[3];
223 block[4] = s1[4] - s2[4];
224 block[5] = s1[5] - s2[5];
225 block[6] = s1[6] - s2[6];
226 block[7] = s1[7] - s2[7];
9dbcbd92
MN
227 s1 += stride;
228 s2 += stride;
c13e1abd 229 block += 8;
9dbcbd92
MN
230 }
231}
232
233
eb4b3dd3
ZK
234static void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
235 int line_size)
de6d9b64 236{
de6d9b64
FB
237 int i;
238 UINT8 *cm = cropTbl + MAX_NEG_CROP;
239
240 /* read the pixels */
de6d9b64 241 for(i=0;i<8;i++) {
c13e1abd
FH
242 pixels[0] = cm[block[0]];
243 pixels[1] = cm[block[1]];
244 pixels[2] = cm[block[2]];
245 pixels[3] = cm[block[3]];
246 pixels[4] = cm[block[4]];
247 pixels[5] = cm[block[5]];
248 pixels[6] = cm[block[6]];
249 pixels[7] = cm[block[7]];
250
251 pixels += line_size;
252 block += 8;
de6d9b64
FB
253 }
254}
255
eb4b3dd3 256static void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
c13e1abd 257 int line_size)
de6d9b64 258{
de6d9b64
FB
259 int i;
260 UINT8 *cm = cropTbl + MAX_NEG_CROP;
261
262 /* read the pixels */
de6d9b64 263 for(i=0;i<8;i++) {
c13e1abd
FH
264 pixels[0] = cm[pixels[0] + block[0]];
265 pixels[1] = cm[pixels[1] + block[1]];
266 pixels[2] = cm[pixels[2] + block[2]];
267 pixels[3] = cm[pixels[3] + block[3]];
268 pixels[4] = cm[pixels[4] + block[4]];
269 pixels[5] = cm[pixels[5] + block[5]];
270 pixels[6] = cm[pixels[6] + block[6]];
271 pixels[7] = cm[pixels[7] + block[7]];
272 pixels += line_size;
273 block += 8;
de6d9b64
FB
274 }
275}
59fe111e
MN
276#if 0
277
278#define PIXOP2(OPNAME, OP) \
b3184779 279static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
280{\
281 int i;\
282 for(i=0; i<h; i++){\
283 OP(*((uint64_t*)block), LD64(pixels));\
284 pixels+=line_size;\
285 block +=line_size;\
286 }\
287}\
288\
45553457 289static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
290{\
291 int i;\
292 for(i=0; i<h; i++){\
293 const uint64_t a= LD64(pixels );\
294 const uint64_t b= LD64(pixels+1);\
295 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
296 pixels+=line_size;\
297 block +=line_size;\
298 }\
299}\
300\
45553457 301static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
302{\
303 int i;\
304 for(i=0; i<h; i++){\
305 const uint64_t a= LD64(pixels );\
306 const uint64_t b= LD64(pixels+1);\
307 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
308 pixels+=line_size;\
309 block +=line_size;\
310 }\
311}\
312\
45553457 313static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
314{\
315 int i;\
316 for(i=0; i<h; i++){\
317 const uint64_t a= LD64(pixels );\
318 const uint64_t b= LD64(pixels+line_size);\
319 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
320 pixels+=line_size;\
321 block +=line_size;\
322 }\
323}\
324\
45553457 325static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
326{\
327 int i;\
328 for(i=0; i<h; i++){\
329 const uint64_t a= LD64(pixels );\
330 const uint64_t b= LD64(pixels+line_size);\
331 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
332 pixels+=line_size;\
333 block +=line_size;\
334 }\
335}\
336\
45553457 337static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
338{\
339 int i;\
340 const uint64_t a= LD64(pixels );\
341 const uint64_t b= LD64(pixels+1);\
342 uint64_t l0= (a&0x0303030303030303ULL)\
343 + (b&0x0303030303030303ULL)\
344 + 0x0202020202020202ULL;\
345 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
346 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
347 uint64_t l1,h1;\
348\
349 pixels+=line_size;\
350 for(i=0; i<h; i+=2){\
351 uint64_t a= LD64(pixels );\
352 uint64_t b= LD64(pixels+1);\
353 l1= (a&0x0303030303030303ULL)\
354 + (b&0x0303030303030303ULL);\
355 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
356 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
357 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
358 pixels+=line_size;\
359 block +=line_size;\
360 a= LD64(pixels );\
361 b= LD64(pixels+1);\
362 l0= (a&0x0303030303030303ULL)\
363 + (b&0x0303030303030303ULL)\
364 + 0x0202020202020202ULL;\
365 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
366 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
367 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
368 pixels+=line_size;\
369 block +=line_size;\
370 }\
371}\
372\
45553457 373static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
374{\
375 int i;\
376 const uint64_t a= LD64(pixels );\
377 const uint64_t b= LD64(pixels+1);\
378 uint64_t l0= (a&0x0303030303030303ULL)\
379 + (b&0x0303030303030303ULL)\
380 + 0x0101010101010101ULL;\
381 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
382 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
383 uint64_t l1,h1;\
384\
385 pixels+=line_size;\
386 for(i=0; i<h; i+=2){\
387 uint64_t a= LD64(pixels );\
388 uint64_t b= LD64(pixels+1);\
389 l1= (a&0x0303030303030303ULL)\
390 + (b&0x0303030303030303ULL);\
391 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
392 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
393 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
394 pixels+=line_size;\
395 block +=line_size;\
396 a= LD64(pixels );\
397 b= LD64(pixels+1);\
398 l0= (a&0x0303030303030303ULL)\
399 + (b&0x0303030303030303ULL)\
400 + 0x0101010101010101ULL;\
401 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
402 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
403 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
404 pixels+=line_size;\
405 block +=line_size;\
406 }\
407}\
408\
45553457
ZK
409CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
410CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
411CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
412CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
413CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
414CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
415CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
59fe111e
MN
416
417#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
418#else // 64 bit variant
419
420#define PIXOP2(OPNAME, OP) \
45553457 421static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
59fe111e
MN
422 int i;\
423 for(i=0; i<h; i++){\
424 OP(*((uint32_t*)(block )), LD32(pixels ));\
425 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
426 pixels+=line_size;\
427 block +=line_size;\
428 }\
429}\
45553457
ZK
430static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
431 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
b3184779 432}\
59fe111e 433\
b3184779
MN
434static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
435 int src_stride1, int src_stride2, int h){\
59fe111e
MN
436 int i;\
437 for(i=0; i<h; i++){\
b3184779
MN
438 uint32_t a,b;\
439 a= LD32(&src1[i*src_stride1 ]);\
440 b= LD32(&src2[i*src_stride2 ]);\
441 OP(*((uint32_t*)&dst[i*dst_stride ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
442 a= LD32(&src1[i*src_stride1+4]);\
443 b= LD32(&src2[i*src_stride2+4]);\
444 OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
59fe111e
MN
445 }\
446}\
447\
b3184779
MN
448static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
449 int src_stride1, int src_stride2, int h){\
59fe111e
MN
450 int i;\
451 for(i=0; i<h; i++){\
b3184779
MN
452 uint32_t a,b;\
453 a= LD32(&src1[i*src_stride1 ]);\
454 b= LD32(&src2[i*src_stride2 ]);\
455 OP(*((uint32_t*)&dst[i*dst_stride ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
456 a= LD32(&src1[i*src_stride1+4]);\
457 b= LD32(&src2[i*src_stride2+4]);\
458 OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
59fe111e
MN
459 }\
460}\
461\
b3184779
MN
462static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
463 int src_stride1, int src_stride2, int h){\
464 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
465 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
466}\
467\
468static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
469 int src_stride1, int src_stride2, int h){\
470 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
471 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
472}\
473\
45553457 474static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
475 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
476}\
477\
45553457 478static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
479 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
480}\
481\
45553457 482static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
483 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
484}\
485\
45553457 486static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
487 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
488}\
489\
490static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
491 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
492 int i;\
493 for(i=0; i<h; i++){\
b3184779
MN
494 uint32_t a, b, c, d, l0, l1, h0, h1;\
495 a= LD32(&src1[i*src_stride1]);\
496 b= LD32(&src2[i*src_stride2]);\
497 c= LD32(&src3[i*src_stride3]);\
498 d= LD32(&src4[i*src_stride4]);\
499 l0= (a&0x03030303UL)\
500 + (b&0x03030303UL)\
501 + 0x02020202UL;\
502 h0= ((a&0xFCFCFCFCUL)>>2)\
503 + ((b&0xFCFCFCFCUL)>>2);\
504 l1= (c&0x03030303UL)\
505 + (d&0x03030303UL);\
506 h1= ((c&0xFCFCFCFCUL)>>2)\
507 + ((d&0xFCFCFCFCUL)>>2);\
508 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
509 a= LD32(&src1[i*src_stride1+4]);\
510 b= LD32(&src2[i*src_stride2+4]);\
511 c= LD32(&src3[i*src_stride3+4]);\
512 d= LD32(&src4[i*src_stride4+4]);\
513 l0= (a&0x03030303UL)\
514 + (b&0x03030303UL)\
515 + 0x02020202UL;\
516 h0= ((a&0xFCFCFCFCUL)>>2)\
517 + ((b&0xFCFCFCFCUL)>>2);\
518 l1= (c&0x03030303UL)\
519 + (d&0x03030303UL);\
520 h1= ((c&0xFCFCFCFCUL)>>2)\
521 + ((d&0xFCFCFCFCUL)>>2);\
522 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
523 }\
524}\
b3184779
MN
525static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
526 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
527 int i;\
528 for(i=0; i<h; i++){\
b3184779
MN
529 uint32_t a, b, c, d, l0, l1, h0, h1;\
530 a= LD32(&src1[i*src_stride1]);\
531 b= LD32(&src2[i*src_stride2]);\
532 c= LD32(&src3[i*src_stride3]);\
533 d= LD32(&src4[i*src_stride4]);\
534 l0= (a&0x03030303UL)\
535 + (b&0x03030303UL)\
536 + 0x01010101UL;\
537 h0= ((a&0xFCFCFCFCUL)>>2)\
538 + ((b&0xFCFCFCFCUL)>>2);\
539 l1= (c&0x03030303UL)\
540 + (d&0x03030303UL);\
541 h1= ((c&0xFCFCFCFCUL)>>2)\
542 + ((d&0xFCFCFCFCUL)>>2);\
543 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
544 a= LD32(&src1[i*src_stride1+4]);\
545 b= LD32(&src2[i*src_stride2+4]);\
546 c= LD32(&src3[i*src_stride3+4]);\
547 d= LD32(&src4[i*src_stride4+4]);\
548 l0= (a&0x03030303UL)\
549 + (b&0x03030303UL)\
550 + 0x01010101UL;\
551 h0= ((a&0xFCFCFCFCUL)>>2)\
552 + ((b&0xFCFCFCFCUL)>>2);\
553 l1= (c&0x03030303UL)\
554 + (d&0x03030303UL);\
555 h1= ((c&0xFCFCFCFCUL)>>2)\
556 + ((d&0xFCFCFCFCUL)>>2);\
557 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
558 }\
559}\
b3184779
MN
560static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
561 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
562 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
563 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
564}\
565static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
566 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
567 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
568 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
569}\
59fe111e 570\
45553457 571static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
572{\
573 int j;\
574 for(j=0; j<2; j++){\
575 int i;\
576 const uint32_t a= LD32(pixels );\
577 const uint32_t b= LD32(pixels+1);\
578 uint32_t l0= (a&0x03030303UL)\
579 + (b&0x03030303UL)\
580 + 0x02020202UL;\
581 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
582 + ((b&0xFCFCFCFCUL)>>2);\
583 uint32_t l1,h1;\
584\
585 pixels+=line_size;\
586 for(i=0; i<h; i+=2){\
587 uint32_t a= LD32(pixels );\
588 uint32_t b= LD32(pixels+1);\
589 l1= (a&0x03030303UL)\
590 + (b&0x03030303UL);\
591 h1= ((a&0xFCFCFCFCUL)>>2)\
592 + ((b&0xFCFCFCFCUL)>>2);\
593 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
594 pixels+=line_size;\
595 block +=line_size;\
596 a= LD32(pixels );\
597 b= LD32(pixels+1);\
598 l0= (a&0x03030303UL)\
599 + (b&0x03030303UL)\
600 + 0x02020202UL;\
601 h0= ((a&0xFCFCFCFCUL)>>2)\
602 + ((b&0xFCFCFCFCUL)>>2);\
603 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
604 pixels+=line_size;\
605 block +=line_size;\
606 }\
607 pixels+=4-line_size*(h+1);\
608 block +=4-line_size*h;\
609 }\
610}\
611\
45553457 612static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
613{\
614 int j;\
615 for(j=0; j<2; j++){\
616 int i;\
617 const uint32_t a= LD32(pixels );\
618 const uint32_t b= LD32(pixels+1);\
619 uint32_t l0= (a&0x03030303UL)\
620 + (b&0x03030303UL)\
621 + 0x01010101UL;\
622 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
623 + ((b&0xFCFCFCFCUL)>>2);\
624 uint32_t l1,h1;\
625\
626 pixels+=line_size;\
627 for(i=0; i<h; i+=2){\
628 uint32_t a= LD32(pixels );\
629 uint32_t b= LD32(pixels+1);\
630 l1= (a&0x03030303UL)\
631 + (b&0x03030303UL);\
632 h1= ((a&0xFCFCFCFCUL)>>2)\
633 + ((b&0xFCFCFCFCUL)>>2);\
634 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
635 pixels+=line_size;\
636 block +=line_size;\
637 a= LD32(pixels );\
638 b= LD32(pixels+1);\
639 l0= (a&0x03030303UL)\
640 + (b&0x03030303UL)\
641 + 0x01010101UL;\
642 h0= ((a&0xFCFCFCFCUL)>>2)\
643 + ((b&0xFCFCFCFCUL)>>2);\
644 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
645 pixels+=line_size;\
646 block +=line_size;\
647 }\
648 pixels+=4-line_size*(h+1);\
649 block +=4-line_size*h;\
650 }\
651}\
652\
45553457
ZK
653CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
654CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
655CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
656CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
657CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
658CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
659CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
660CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
b3184779 661
59fe111e
MN
662#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
663#endif
59fe111e
MN
664#define op_put(a, b) a = b
665
666PIXOP2(avg, op_avg)
667PIXOP2(put, op_put)
668#undef op_avg
669#undef op_put
670
de6d9b64
FB
/* Rounded-up integer average of two / four values.  Every argument is
 * parenthesized so that expressions with operators of lower precedence than
 * '+' (e.g. a|b, a&b, c?x:y) can be passed safely.  Arguments are evaluated
 * exactly once. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
673
073b013d 674
b3184779 675static void gmc1_c(UINT8 *dst, UINT8 *src, int stride, int h, int x16, int y16, int rounder)
44eb4951
MN
676{
677 const int A=(16-x16)*(16-y16);
678 const int B=( x16)*(16-y16);
679 const int C=(16-x16)*( y16);
680 const int D=( x16)*( y16);
681 int i;
44eb4951
MN
682
683 for(i=0; i<h; i++)
684 {
b3184779
MN
685 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
686 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
687 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
688 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
689 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
690 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
691 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
692 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
693 dst+= stride;
694 src+= stride;
44eb4951
MN
695 }
696}
697
073b013d
MN
698static void gmc_c(UINT8 *dst, UINT8 *src, int stride, int h, int ox, int oy,
699 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
700{
701 int y, vx, vy;
702 const int s= 1<<shift;
703
704 width--;
705 height--;
706
707 for(y=0; y<h; y++){
708 int x;
709
710 vx= ox;
711 vy= oy;
712 for(x=0; x<8; x++){ //XXX FIXME optimize
713 int src_x, src_y, frac_x, frac_y, index;
714
715 src_x= vx>>16;
716 src_y= vy>>16;
717 frac_x= src_x&(s-1);
718 frac_y= src_y&(s-1);
719 src_x>>=shift;
720 src_y>>=shift;
721
722 if((unsigned)src_x < width){
723 if((unsigned)src_y < height){
724 index= src_x + src_y*stride;
725 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
726 + src[index +1]* frac_x )*(s-frac_y)
727 + ( src[index+stride ]*(s-frac_x)
728 + src[index+stride+1]* frac_x )* frac_y
729 + r)>>(shift*2);
730 }else{
731 index= src_x + clip(src_y, 0, height)*stride;
732 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
733 + src[index +1]* frac_x )*s
734 + r)>>(shift*2);
735 }
736 }else{
737 if((unsigned)src_y < height){
738 index= clip(src_x, 0, width) + src_y*stride;
739 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
740 + src[index+stride ]* frac_y )*s
741 + r)>>(shift*2);
742 }else{
743 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
744 dst[y*stride + x]= src[index ];
745 }
746 }
747
748 vx+= dxx;
749 vy+= dyx;
750 }
751 ox += dxy;
752 oy += dyy;
753 }
754}
755
b3184779 756static inline void copy_block17(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
44eb4951 757{
44eb4951
MN
758 int i;
759 for(i=0; i<h; i++)
760 {
b3184779
MN
761 ST32(dst , LD32(src ));
762 ST32(dst+4 , LD32(src+4 ));
763 ST32(dst+8 , LD32(src+8 ));
764 ST32(dst+12, LD32(src+12));
765 dst[16]= src[16];
44eb4951
MN
766 dst+=dstStride;
767 src+=srcStride;
768 }
769}
770
b3184779 771static inline void copy_block9(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
44eb4951
MN
772{
773 int i;
b3184779 774 for(i=0; i<h; i++)
44eb4951 775 {
b3184779
MN
776 ST32(dst , LD32(src ));
777 ST32(dst+4 , LD32(src+4 ));
778 dst[8]= src[8];
44eb4951
MN
779 dst+=dstStride;
780 src+=srcStride;
781 }
782}
783
b3184779
MN
784#define QPEL_MC(r, OPNAME, RND, OP) \
785static void OPNAME ## mpeg4_qpel8_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
786 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
787 int i;\
788 for(i=0; i<h; i++)\
789 {\
790 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
791 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
792 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
793 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
794 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
795 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
796 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
797 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
798 dst+=dstStride;\
799 src+=srcStride;\
800 }\
44eb4951
MN
801}\
802\
b3184779
MN
803static void OPNAME ## mpeg4_qpel8_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
804 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
805 int i;\
806 for(i=0; i<w; i++)\
807 {\
808 const int src0= src[0*srcStride];\
809 const int src1= src[1*srcStride];\
810 const int src2= src[2*srcStride];\
811 const int src3= src[3*srcStride];\
812 const int src4= src[4*srcStride];\
813 const int src5= src[5*srcStride];\
814 const int src6= src[6*srcStride];\
815 const int src7= src[7*srcStride];\
816 const int src8= src[8*srcStride];\
817 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
818 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
819 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
820 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
821 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
822 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
823 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
824 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
825 dst++;\
826 src++;\
827 }\
828}\
829\
830static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
831 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
832 int i;\
833 for(i=0; i<h; i++)\
834 {\
835 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
836 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
837 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
838 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
839 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
840 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
841 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
842 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
843 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
844 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
845 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
846 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
847 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
848 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
849 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
850 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
851 dst+=dstStride;\
852 src+=srcStride;\
853 }\
854}\
855\
856static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
857 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
858 int i;\
859 for(i=0; i<w; i++)\
860 {\
861 const int src0= src[0*srcStride];\
862 const int src1= src[1*srcStride];\
863 const int src2= src[2*srcStride];\
864 const int src3= src[3*srcStride];\
865 const int src4= src[4*srcStride];\
866 const int src5= src[5*srcStride];\
867 const int src6= src[6*srcStride];\
868 const int src7= src[7*srcStride];\
869 const int src8= src[8*srcStride];\
870 const int src9= src[9*srcStride];\
871 const int src10= src[10*srcStride];\
872 const int src11= src[11*srcStride];\
873 const int src12= src[12*srcStride];\
874 const int src13= src[13*srcStride];\
875 const int src14= src[14*srcStride];\
876 const int src15= src[15*srcStride];\
877 const int src16= src[16*srcStride];\
878 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
879 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
880 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
881 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
882 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
883 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
884 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
885 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
886 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
887 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
888 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
889 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
890 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
891 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
892 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
893 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
894 dst++;\
895 src++;\
896 }\
897}\
898\
899static void OPNAME ## qpel8_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
45553457 900 OPNAME ## pixels8_c(dst, src, stride, 8);\
b3184779
MN
901}\
902\
903static void OPNAME ## qpel8_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951 904 UINT8 half[64];\
b3184779
MN
905 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
906 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
44eb4951
MN
907}\
908\
b3184779
MN
909static void OPNAME ## qpel8_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
910 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
44eb4951
MN
911}\
912\
b3184779 913static void OPNAME ## qpel8_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951 914 UINT8 half[64];\
b3184779
MN
915 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
916 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
44eb4951
MN
917}\
918\
b3184779
MN
919static void OPNAME ## qpel8_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
920 UINT8 full[16*9];\
44eb4951 921 UINT8 half[64];\
b3184779
MN
922 copy_block9(full, src, 16, stride, 9);\
923 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
924 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
44eb4951
MN
925}\
926\
b3184779
MN
927static void OPNAME ## qpel8_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
928 UINT8 full[16*9];\
929 copy_block9(full, src, 16, stride, 9);\
930 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16, 8);\
44eb4951
MN
931}\
932\
b3184779
MN
933static void OPNAME ## qpel8_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
934 UINT8 full[16*9];\
44eb4951 935 UINT8 half[64];\
b3184779
MN
936 copy_block9(full, src, 16, stride, 9);\
937 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
938 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
44eb4951 939}\
b3184779
MN
940static void OPNAME ## qpel8_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
941 UINT8 full[16*9];\
44eb4951 942 UINT8 halfH[72];\
7ff037e9 943 UINT8 halfV[64];\
44eb4951 944 UINT8 halfHV[64];\
b3184779
MN
945 copy_block9(full, src, 16, stride, 9);\
946 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
947 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
948 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
949 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 950}\
b3184779
MN
951static void OPNAME ## qpel8_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
952 UINT8 full[16*9];\
44eb4951 953 UINT8 halfH[72];\
7ff037e9 954 UINT8 halfV[64];\
44eb4951 955 UINT8 halfHV[64];\
b3184779
MN
956 copy_block9(full, src, 16, stride, 9);\
957 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
958 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
959 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
960 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 961}\
b3184779
MN
962static void OPNAME ## qpel8_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
963 UINT8 full[16*9];\
44eb4951 964 UINT8 halfH[72];\
7ff037e9 965 UINT8 halfV[64];\
44eb4951 966 UINT8 halfHV[64];\
b3184779
MN
967 copy_block9(full, src, 16, stride, 9);\
968 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
969 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
970 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
971 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 972}\
b3184779
MN
973static void OPNAME ## qpel8_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
974 UINT8 full[16*9];\
44eb4951 975 UINT8 halfH[72];\
7ff037e9 976 UINT8 halfV[64];\
44eb4951 977 UINT8 halfHV[64];\
b3184779
MN
978 copy_block9(full, src, 16, stride, 9);\
979 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
980 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
981 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
982 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 983}\
b3184779 984static void OPNAME ## qpel8_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951
MN
985 UINT8 halfH[72];\
986 UINT8 halfHV[64];\
b3184779
MN
987 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
988 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
989 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
44eb4951 990}\
b3184779 991static void OPNAME ## qpel8_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951
MN
992 UINT8 halfH[72];\
993 UINT8 halfHV[64];\
b3184779
MN
994 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
995 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
996 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
44eb4951 997}\
b3184779
MN
998static void OPNAME ## qpel8_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
999 UINT8 full[16*9];\
44eb4951 1000 UINT8 halfH[72];\
7ff037e9 1001 UINT8 halfV[64];\
44eb4951 1002 UINT8 halfHV[64];\
b3184779
MN
1003 copy_block9(full, src, 16, stride, 9);\
1004 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1005 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
1006 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
1007 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1008}\
b3184779
MN
1009static void OPNAME ## qpel8_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
1010 UINT8 full[16*9];\
44eb4951 1011 UINT8 halfH[72];\
7ff037e9 1012 UINT8 halfV[64];\
44eb4951 1013 UINT8 halfHV[64];\
b3184779
MN
1014 copy_block9(full, src, 16, stride, 9);\
1015 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
1017 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
1018 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1019}\
b3184779 1020static void OPNAME ## qpel8_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951 1021 UINT8 halfH[72];\
b3184779
MN
1022 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1023 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8, 8);\
1024}\
1025static void OPNAME ## qpel16_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
45553457 1026 OPNAME ## pixels16_c(dst, src, stride, 16);\
b3184779
MN
1027}\
1028\
1029static void OPNAME ## qpel16_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
1030 UINT8 half[256];\
1031 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1032 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1033}\
1034\
1035static void OPNAME ## qpel16_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
1036 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
44eb4951 1037}\
b3184779
MN
1038\
1039static void OPNAME ## qpel16_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
1040 UINT8 half[256];\
1041 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1042 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1043}\
1044\
1045static void OPNAME ## qpel16_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
1046 UINT8 full[24*17];\
1047 UINT8 half[256];\
1048 copy_block17(full, src, 24, stride, 17);\
1049 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
1050 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1051}\
1052\
1053static void OPNAME ## qpel16_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
1054 UINT8 full[24*17];\
1055 copy_block17(full, src, 24, stride, 17);\
1056 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24, 16);\
1057}\
1058\
1059static void OPNAME ## qpel16_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
1060 UINT8 full[24*17];\
1061 UINT8 half[256];\
1062 copy_block17(full, src, 24, stride, 17);\
1063 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
1064 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1065}\
1066static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
1067 UINT8 full[24*17];\
1068 UINT8 halfH[272];\
1069 UINT8 halfV[256];\
1070 UINT8 halfHV[256];\
1071 copy_block17(full, src, 24, stride, 17);\
1072 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1073 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
1074 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1075 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1076}\
1077static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
1078 UINT8 full[24*17];\
1079 UINT8 halfH[272];\
1080 UINT8 halfV[256];\
1081 UINT8 halfHV[256];\
1082 copy_block17(full, src, 24, stride, 17);\
1083 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1084 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
1085 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1086 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1087}\
1088static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
1089 UINT8 full[24*17];\
1090 UINT8 halfH[272];\
1091 UINT8 halfV[256];\
1092 UINT8 halfHV[256];\
1093 copy_block17(full, src, 24, stride, 17);\
1094 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1095 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
1096 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1097 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1098}\
1099static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
1100 UINT8 full[24*17];\
1101 UINT8 halfH[272];\
1102 UINT8 halfV[256];\
1103 UINT8 halfHV[256];\
1104 copy_block17(full, src, 24, stride, 17);\
1105 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1106 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
1107 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1108 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1109}\
1110static void OPNAME ## qpel16_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
1111 UINT8 halfH[272];\
1112 UINT8 halfHV[256];\
1113 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1114 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1115 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1116}\
1117static void OPNAME ## qpel16_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
1118 UINT8 halfH[272];\
1119 UINT8 halfHV[256];\
1120 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1121 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1122 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1123}\
1124static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
1125 UINT8 full[24*17];\
1126 UINT8 halfH[272];\
1127 UINT8 halfV[256];\
1128 UINT8 halfHV[256];\
1129 copy_block17(full, src, 24, stride, 17);\
1130 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1131 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
1132 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1133 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1134}\
1135static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
1136 UINT8 full[24*17];\
1137 UINT8 halfH[272];\
1138 UINT8 halfV[256];\
1139 UINT8 halfHV[256];\
1140 copy_block17(full, src, 24, stride, 17);\
1141 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1142 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
1143 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1144 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1145}\
1146static void OPNAME ## qpel16_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
1147 UINT8 halfH[272];\
1148 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1149 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16, 16);\
45553457 1150}
44eb4951 1151
b3184779
MN
/*
 * Store operators handed to QPEL_MC as its OP argument.
 * "a" is the destination lvalue, "b" the unscaled filter sum.
 * Each one expects a local "cm" (cropTbl + MAX_NEG_CROP in the lowpass
 * functions) to be in scope where it is expanded; the result of the
 * shift by 5 is clamped through that table.
 *   op_put          a = cm[(b + 16) >> 5]
 *   op_put_no_rnd   same, with a bias of 15 instead of 16
 *   op_avg          mean of the old a and the op_put result, rounded up
 *   op_avg_no_rnd   bias 15 and no +1 in the mean; note that no QPEL_MC
 *                   instantiation below uses it (the avg_no_rnd line is
 *                   commented out and passes op_avg anyway)
 */
1152#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1153#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1154#define op_put(a, b) a = cm[((b) + 16)>>5]
1155#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1156
/* Instantiate the qpel function family for put, put_no_rnd and avg.
 * Arguments 2..4 are pasted in as OPNAME, RND and OP; the meaning of the
 * first argument is not visible here -- TODO confirm against the QPEL_MC
 * definition. */
1157QPEL_MC(0, put_ , _ , op_put)
1158QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1159QPEL_MC(0, avg_ , _ , op_avg)
1160//QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* the operators are only needed for the instantiations above */
1161#undef op_avg
1162#undef op_avg_no_rnd
1163#undef op_put
1164#undef op_put_no_rnd
44eb4951 1165
1457ab52
MN
1166static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1167 uint8_t *cm = cropTbl + MAX_NEG_CROP;
1168 int i;
1169
1170 for(i=0; i<h; i++){
1171 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1172 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1173 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1174 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1175 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1176 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1177 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1178 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1179 dst+=dstStride;
1180 src+=srcStride;
1181 }
1182}
1183
1184static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1185 uint8_t *cm = cropTbl + MAX_NEG_CROP;
1186 int i;
1187
1188 for(i=0; i<w; i++){
1189 const int src_1= src[ -srcStride];
1190 const int src0 = src[0 ];
1191 const int src1 = src[ srcStride];
1192 const int src2 = src[2*srcStride];
1193 const int src3 = src[3*srcStride];
1194 const int src4 = src[4*srcStride];
1195 const int src5 = src[5*srcStride];
1196 const int src6 = src[6*srcStride];
1197 const int src7 = src[7*srcStride];
1198 const int src8 = src[8*srcStride];
1199 const int src9 = src[9*srcStride];
1200 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1201 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1202 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1203 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1204 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1205 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1206 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1207 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
1208 src++;
1209 dst++;
1210 }
1211}
1212
/* mc00: no filtering, hands the 8 rows straight to put_pixels8_c. */
static void put_mspel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
1216
/* mc10: horizontally filtered 8x8 block combined with the samples at src
 * through put_pixels8_l2 (presumably an average -- helper not visible here). */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
1222
/* mc20: horizontal lowpass written directly into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1226
/* mc30: like mc10, but the unfiltered operand starts one sample to the
 * right (src+1). */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
1232
/* mc02: vertical lowpass written directly into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1236
/*
 * mc12: combines a vertically filtered block with a block that was
 * filtered horizontally and then vertically.
 * The horizontal pass covers 11 rows starting one row above src, so that
 * the vertical pass over h_rows+8 can read its row -1 .. row 9 neighbours.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t h_rows[8*11];
    uint8_t v_only[8*8];
    uint8_t hv[8*8];

    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hv, h_rows+8, 8, 8, 8);
    put_pixels8_l2(dst, v_only, hv, stride, 8, 8, 8);
}
/*
 * mc32: same as mc12 except that the vertical-only block is taken one
 * sample to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t h_rows[8*11];
    uint8_t v_only[8*8];
    uint8_t hv[8*8];

    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hv, h_rows+8, 8, 8, 8);
    put_pixels8_l2(dst, v_only, hv, stride, 8, 8, 8);
}
/*
 * mc22: horizontal lowpass over 11 rows (starting one row above src),
 * followed by a vertical lowpass of that intermediate straight into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t h_rows[8*11];

    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, h_rows+8, stride, 8, 8);
}
1260
1261
1262static inline int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1263{
1264 int s, i;
1265
1266 s = 0;
ba6802de 1267 for(i=0;i<16;i++) {
de6d9b64
FB
1268 s += abs(pix1[0] - pix2[0]);
1269 s += abs(pix1[1] - pix2[1]);
1270 s += abs(pix1[2] - pix2[2]);
1271 s += abs(pix1[3] - pix2[3]);
1272 s += abs(pix1[4] - pix2[4]);
1273 s += abs(pix1[5] - pix2[5]);
1274 s += abs(pix1[6] - pix2[6]);
1275 s += abs(pix1[7] - pix2[7]);
1276 s += abs(pix1[8] - pix2[8]);
1277 s += abs(pix1[9] - pix2[9]);
1278 s += abs(pix1[10] - pix2[10]);
1279 s += abs(pix1[11] - pix2[11]);
1280 s += abs(pix1[12] - pix2[12]);
1281 s += abs(pix1[13] - pix2[13]);
1282 s += abs(pix1[14] - pix2[14]);
1283 s += abs(pix1[15] - pix2[15]);
1284 pix1 += line_size;
1285 pix2 += line_size;
1286 }
1287 return s;
1288}
1289
eb4b3dd3 1290static int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1291{
1292 int s, i;
1293
1294 s = 0;
ba6802de 1295 for(i=0;i<16;i++) {
de6d9b64
FB
1296 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1297 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1298 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1299 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1300 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1301 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1302 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1303 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1304 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1305 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1306 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1307 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1308 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1309 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1310 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1311 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1312 pix1 += line_size;
1313 pix2 += line_size;
1314 }
1315 return s;
1316}
1317
eb4b3dd3 1318static int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1319{
1320 int s, i;
1321 UINT8 *pix3 = pix2 + line_size;
1322
1323 s = 0;
ba6802de 1324 for(i=0;i<16;i++) {
de6d9b64
FB
1325 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1326 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1327 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1328 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1329 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1330 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1331 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1332 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1333 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1334 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1335 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1336 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1337 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1338 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1339 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1340 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1341 pix1 += line_size;
1342 pix2 += line_size;
1343 pix3 += line_size;
1344 }
1345 return s;
1346}
1347
eb4b3dd3 1348static int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1349{
1350 int s, i;
1351 UINT8 *pix3 = pix2 + line_size;
1352
1353 s = 0;
ba6802de 1354 for(i=0;i<16;i++) {
de6d9b64
FB
1355 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1356 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1357 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1358 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1359 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1360 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1361 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1362 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1363 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1364 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1365 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1366 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1367 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1368 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1369 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1370 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1371 pix1 += line_size;
1372 pix2 += line_size;
1373 pix3 += line_size;
1374 }
1375 return s;
1376}
1377
1457ab52 1378static inline int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
ba6802de
MN
1379{
1380 int s, i;
1381
1382 s = 0;
1383 for(i=0;i<8;i++) {
1384 s += abs(pix1[0] - pix2[0]);
1385 s += abs(pix1[1] - pix2[1]);
1386 s += abs(pix1[2] - pix2[2]);
1387 s += abs(pix1[3] - pix2[3]);
1388 s += abs(pix1[4] - pix2[4]);
1389 s += abs(pix1[5] - pix2[5]);
1390 s += abs(pix1[6] - pix2[6]);
1391 s += abs(pix1[7] - pix2[7]);
1392 pix1 += line_size;
1393 pix2 += line_size;
1394 }
1395 return s;
1396}
1397
eb4b3dd3 1398static int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
ba6802de
MN
1399{
1400 int s, i;
1401
1402 s = 0;
1403 for(i=0;i<8;i++) {
1404 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1405 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1406 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1407 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1408 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1409 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1410 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1411 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1412 pix1 += line_size;
1413 pix2 += line_size;
1414 }
1415 return s;
1416}
1417
eb4b3dd3 1418static int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
ba6802de
MN
1419{
1420 int s, i;
1421 UINT8 *pix3 = pix2 + line_size;
1422
1423 s = 0;
1424 for(i=0;i<8;i++) {
1425 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1426 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1427 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1428 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1429 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1430 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1431 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1432 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1433 pix1 += line_size;
1434 pix2 += line_size;
1435 pix3 += line_size;
1436 }
1437 return s;
1438}
1439
eb4b3dd3 1440static int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
ba6802de
MN
1441{
1442 int s, i;
1443 UINT8 *pix3 = pix2 + line_size;
1444
1445 s = 0;
1446 for(i=0;i<8;i++) {
1447 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1448 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1449 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1450 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1451 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1452 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1453 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1454 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1455 pix1 += line_size;
1456 pix2 += line_size;
1457 pix3 += line_size;
1458 }
1459 return s;
1460}
1461
1457ab52
MN
/* Adapter giving pix_abs16x16_c the (context, a, b, stride) signature;
 * the context pointer s is not used. */
static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride)
{
    const int sad = pix_abs16x16_c(a, b, stride);

    return sad;
}
1465
/* Adapter giving pix_abs8x8_c the (context, a, b, stride) signature;
 * the context pointer s is not used. */
static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride)
{
    const int sad = pix_abs8x8_c(a, b, stride);

    return sad;
}
1469
477ab036 1470void ff_block_permute(DCTELEM *block, UINT8 *permutation, const UINT8 *scantable, int last)
d962f6fd 1471{
7801d21d 1472 int i;
477ab036 1473 DCTELEM temp[64];
7801d21d
MN
1474
1475 if(last<=0) return;
9a7b310d 1476 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
d962f6fd 1477
7801d21d
MN
1478 for(i=0; i<=last; i++){
1479 const int j= scantable[i];
1480 temp[j]= block[j];
1481 block[j]=0;
1482 }
1483
1484 for(i=0; i<=last; i++){
1485 const int j= scantable[i];
1486 const int perm_j= permutation[j];
1487 block[perm_j]= temp[j];
1488 }
d962f6fd 1489}
e0eac44e 1490
eb4b3dd3 1491static void clear_blocks_c(DCTELEM *blocks)
649c00c9
MN
1492{
1493 memset(blocks, 0, sizeof(DCTELEM)*6*64);
1494}
1495
11f18faf
MN
/*
 * Byte-wise in-place addition: dst[i] += src[i] for 0 <= i < w.
 * The sum wraps modulo 256 (uint8_t arithmetic). Nothing is done for w <= 0.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    /* Unrolled main loop, 8 bytes per iteration. The index must advance by
     * 8: stepping by 1 would add src into most bytes several times. */
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    /* remaining 0..7 bytes */
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
1511
/*
 * Byte-wise difference: dst[i] = src1[i] - src2[i] for 0 <= i < w.
 * The result wraps modulo 256 (uint8_t arithmetic). Nothing is done for
 * w <= 0.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    /* Unrolled main loop, 8 bytes per iteration. The index must advance by
     * 8: stepping by 1 recomputes every byte up to eight times and gives
     * wrong results when dst aliases one of the sources. */
    for(i=0; i+7<w; i+=8){
        dst[i+0] = src1[i+0]-src2[i+0];
        dst[i+1] = src1[i+1]-src2[i+1];
        dst[i+2] = src1[i+2]-src2[i+2];
        dst[i+3] = src1[i+3]-src2[i+3];
        dst[i+4] = src1[i+4]-src2[i+4];
        dst[i+5] = src1[i+5]-src2[i+5];
        dst[i+6] = src1[i+6]-src2[i+6];
        dst[i+7] = src1[i+7]-src2[i+7];
    }
    /* remaining 0..7 bytes */
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
1527
1457ab52
MN
/*
 * Butterfly helpers for the Hadamard transforms below.
 * BUTTERFLY2 and BUTTERFLY1 are statement macros: use them followed by a
 * semicolon. They are wrapped in do{}while(0) so that they stay a single
 * statement under an unbraced if/else.
 */

/* o1 = i1+i2, o2 = i1-i2; i1 and i2 are each evaluated twice */
#define BUTTERFLY2(o1,o2,i1,i2) \
do{\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
}while(0)

/* in-place butterfly: (x, y) becomes (x+y, x-y), computed in int.
 * The temporaries carry a macro-specific name so that caller variables
 * called a or b are not shadowed. */
#define BUTTERFLY1(x,y) \
do{\
    const int butterfly1_a_= (x);\
    const int butterfly1_b_= (y);\
    (x)= butterfly1_a_+butterfly1_b_;\
    (y)= butterfly1_a_-butterfly1_b_;\
}while(0)

/* |x+y| + |x-y|: last butterfly stage fused with the absolute-value sum */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
1542
/*
 * Hadamard-domain difference of two 8x8 blocks.
 * The difference src - dst is butterfly-transformed along every row and
 * then along every column; the return value is the sum of the absolute
 * values of the resulting 64 coefficients. The context pointer s is not
 * used.
 */
static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
    int temp[64];
    int sum=0;
    int i;

    /* row pass: three butterfly stages on the per-sample differences */
    for(i=0; i<8; i++){
        int *const row= temp + 8*i;
        const uint8_t *const s_line= src + stride*i;
        const uint8_t *const d_line= dst + stride*i;

        BUTTERFLY2(row[0], row[1], s_line[0]-d_line[0], s_line[1]-d_line[1]);
        BUTTERFLY2(row[2], row[3], s_line[2]-d_line[2], s_line[3]-d_line[3]);
        BUTTERFLY2(row[4], row[5], s_line[4]-d_line[4], s_line[5]-d_line[5]);
        BUTTERFLY2(row[6], row[7], s_line[6]-d_line[6], s_line[7]-d_line[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass: two butterfly stages, the third one is folded into
     * the absolute-value accumulation by BUTTERFLYA */
    for(i=0; i<8; i++){
        int *const col= temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }
    return sum;
}
1592
/*
 * Hadamard-domain magnitude of one 8x8 block.
 * mean is subtracted from every sample, the result is butterfly-transformed
 * along rows and then columns, and the absolute values of the 64
 * coefficients are summed.
 * FIXME (from the original): ignore the 0 term instead of the mean mess.
 */
static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
    int temp[64];
    int sum=0;
    int i;

    /* row pass: three butterfly stages on the mean-removed samples */
    for(i=0; i<8; i++){
        int *const row= temp + 8*i;
        const uint8_t *const line= src + stride*i;

        BUTTERFLY2(row[0], row[1], line[0]-mean, line[1]-mean);
        BUTTERFLY2(row[2], row[3], line[2]-mean, line[3]-mean);
        BUTTERFLY2(row[4], row[5], line[4]-mean, line[5]-mean);
        BUTTERFLY2(row[6], row[7], line[6]-mean, line[7]-mean);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* column pass: two butterfly stages, the third one is folded into
     * the absolute-value accumulation by BUTTERFLYA */
    for(i=0; i<8; i++){
        int *const col= temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }

    return sum;
}
1636
1637static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
1638 MpegEncContext * const s= (MpegEncContext *)c;
1639 DCTELEM temp[64];
1640 int sum=0, i;
1641
1642 s->dsp.diff_pixels(temp, src1, src2, stride);
1643 s->fdct(temp);
1644
1645 for(i=0; i<64; i++)
1646 sum+= ABS(temp[i]);
1647
1648 return sum;
1649}
1650
1651void simple_idct(INT16 *block); //FIXME
1652
1653static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
1654 MpegEncContext * const s= (MpegEncContext *)c;
1655 DCTELEM temp[64], bak[64];
1656 int sum=0, i;
1657
1658 s->mb_intra=0;
1659
1660 s->dsp.diff_pixels(temp, src1, src2, stride);
1661
1662 memcpy(bak, temp, 64*sizeof(DCTELEM));
1663
1664 s->dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
1665 s->dct_unquantize(s, temp, 0, s->qscale);
1666 simple_idct(temp); //FIXME
1667
1668 for(i=0; i<64; i++)
1669 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
1670
1671 return sum;
1672}
1673
/* Each WARPER88_1616 invocation pairs an 8x8 comparison function (first
 * argument) with the name of a 16x16 counterpart (second argument); the
 * second names are the ones dsputil_init() installs in slot [0] of
 * hadamard8_diff[], dct_sad[] and quant_psnr[].
 * NOTE(review): the macro itself is defined earlier in the file; presumably
 * it builds the 16x16 function from four calls to the 8x8 one -- confirm. */
WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
1677
eb4b3dd3 1678void dsputil_init(DSPContext* c, unsigned mask)
e0eac44e 1679{
5abd509a 1680 static int init_done = 0;
d2975f8d 1681 int i;
e0eac44e 1682
5abd509a
ZK
1683 if (!init_done) {
1684 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1685 for(i=0;i<MAX_NEG_CROP;i++) {
1686 cropTbl[i] = 0;
1687 cropTbl[i + MAX_NEG_CROP + 256] = 255;
1688 }
de6d9b64 1689
5abd509a
ZK
1690 for(i=0;i<512;i++) {
1691 squareTbl[i] = (i - 256) * (i - 256);
1692 }
92ddb692
ZK
1693
1694 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
1695
1696 init_done = 1;
de6d9b64
FB
1697 }
1698
eb4b3dd3
ZK
1699 c->get_pixels = get_pixels_c;
1700 c->diff_pixels = diff_pixels_c;
1701 c->put_pixels_clamped = put_pixels_clamped_c;
1702 c->add_pixels_clamped = add_pixels_clamped_c;
1703 c->gmc1 = gmc1_c;
1704 c->gmc = gmc_c;
1705 c->clear_blocks = clear_blocks_c;
1706 c->pix_sum = pix_sum_c;
1707 c->pix_norm1 = pix_norm1_c;
1457ab52
MN
1708 c->sse[0]= sse16_c;
1709 c->sse[1]= sse8_c;
eb4b3dd3 1710
45553457 1711 /* TODO [0] 16 [1] 8 */
eb4b3dd3
ZK
1712 c->pix_abs16x16 = pix_abs16x16_c;
1713 c->pix_abs16x16_x2 = pix_abs16x16_x2_c;
1714 c->pix_abs16x16_y2 = pix_abs16x16_y2_c;
1715 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1716 c->pix_abs8x8 = pix_abs8x8_c;
1717 c->pix_abs8x8_x2 = pix_abs8x8_x2_c;
1718 c->pix_abs8x8_y2 = pix_abs8x8_y2_c;
1719 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1720
45553457
ZK
1721#define dspfunc(PFX, IDX, NUM) \
1722 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
1723 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
1724 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
1725 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
1726
1727 dspfunc(put, 0, 16);
1728 dspfunc(put_no_rnd, 0, 16);
1729 dspfunc(put, 1, 8);
1730 dspfunc(put_no_rnd, 1, 8);
1731
1732 dspfunc(avg, 0, 16);
1733 dspfunc(avg_no_rnd, 0, 16);
1734 dspfunc(avg, 1, 8);
1735 dspfunc(avg_no_rnd, 1, 8);
1736#undef dspfunc
1737
1738#define dspfunc(PFX, IDX, NUM) \
1739 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
1740 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
1741 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
1742 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
1743 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
1744 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
1745 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
1746 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
1747 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
1748 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
1749 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
1750 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
1751 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
1752 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
1753 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
1754 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
1755
1756 dspfunc(put_qpel, 0, 16);
1757 dspfunc(put_no_rnd_qpel, 0, 16);
1758
1759 dspfunc(avg_qpel, 0, 16);
1760 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
1761
1762 dspfunc(put_qpel, 1, 8);
1763 dspfunc(put_no_rnd_qpel, 1, 8);
1764
1765 dspfunc(avg_qpel, 1, 8);
1766 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
1767#undef dspfunc
c9a2ebc4 1768
1457ab52
MN
1769 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
1770 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
1771 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
1772 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
1773 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
1774 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
1775 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
1776 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
1777
1778 c->hadamard8_diff[0]= hadamard8_diff16_c;
1779 c->hadamard8_diff[1]= hadamard8_diff_c;
1780 c->hadamard8_abs = hadamard8_abs_c;
1781
1782 c->dct_sad[0]= dct_sad16x16_c;
1783 c->dct_sad[1]= dct_sad8x8_c;
1784
1785 c->sad[0]= sad16x16_c;
1786 c->sad[1]= sad8x8_c;
1787
1788 c->quant_psnr[0]= quant_psnr16x16_c;
1789 c->quant_psnr[1]= quant_psnr8x8_c;
1790
11f18faf
MN
1791 c->add_bytes= add_bytes_c;
1792 c->diff_bytes= diff_bytes_c;
1793
980fc7b8 1794#ifdef HAVE_MMX
eb4b3dd3 1795 dsputil_init_mmx(c, mask);
34dfe896
ZK
1796 if (ff_bit_exact)
1797 {
1798 /* FIXME - AVCodec context should have flag for bitexact match */
1799 /* fprintf(stderr, "\n\n\nff_bit_exact %d\n\n\n\n", ff_bit_exact); */
1800 dsputil_set_bit_exact_mmx(c, mask);
1801 }
de6d9b64 1802#endif
3d03c0a2 1803#ifdef ARCH_ARMV4L
eb4b3dd3 1804 dsputil_init_armv4l(c, mask);
3d03c0a2 1805#endif
c34270f5 1806#ifdef HAVE_MLIB
eb4b3dd3 1807 dsputil_init_mlib(c, mask);
c34270f5 1808#endif
1e98dffb 1809#ifdef ARCH_ALPHA
eb4b3dd3 1810 dsputil_init_alpha(c, mask);
1e98dffb 1811#endif
59925ef2 1812#ifdef ARCH_POWERPC
eb4b3dd3 1813 dsputil_init_ppc(c, mask);
a43bd1d7 1814#endif
d46aba26 1815#ifdef HAVE_MMI
eb4b3dd3 1816 dsputil_init_mmi(c, mask);
d46aba26 1817#endif
de6d9b64 1818}
43f1708f 1819
57060b1e
FB
1820/* remove any non bit exact operation (testing purpose) */
1821void avcodec_set_bit_exact(void)
1822{
5596c60c 1823 ff_bit_exact=1;
57060b1e 1824#ifdef HAVE_MMX
34dfe896 1825// FIXME - better set_bit_exact
eb4b3dd3 1826// dsputil_set_bit_exact_mmx();
57060b1e
FB
1827#endif
1828}