* useless commit - ignore
[libav.git] / libavcodec / dsputil.c
CommitLineData
/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 */
de6d9b64
FB
21#include "avcodec.h"
22#include "dsputil.h"
23
de6d9b64 24void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
9dbcbd92 25void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
de6d9b64
FB
26void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
27void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
073b013d
MN
28void (*ff_gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
29void (*ff_gmc )(UINT8 *dst, UINT8 *src, int stride, int h, int ox, int oy,
30 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
649c00c9 31void (*clear_blocks)(DCTELEM *blocks);
3aa102be
MN
32int (*pix_sum)(UINT8 * pix, int line_size);
33int (*pix_norm1)(UINT8 * pix, int line_size);
de6d9b64
FB
34
35op_pixels_abs_func pix_abs16x16;
36op_pixels_abs_func pix_abs16x16_x2;
37op_pixels_abs_func pix_abs16x16_y2;
38op_pixels_abs_func pix_abs16x16_xy2;
39
ba6802de
MN
40op_pixels_abs_func pix_abs8x8;
41op_pixels_abs_func pix_abs8x8_x2;
42op_pixels_abs_func pix_abs8x8_y2;
43op_pixels_abs_func pix_abs8x8_xy2;
44
5596c60c
MN
45int ff_bit_exact=0;
46
0cfa9713 47UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
de6d9b64
FB
48UINT32 squareTbl[512];
49
2ad1516a
MN
50const UINT8 ff_zigzag_direct[64] = {
51 0, 1, 8, 16, 9, 2, 3, 10,
52 17, 24, 32, 25, 18, 11, 4, 5,
e0eac44e 53 12, 19, 26, 33, 40, 48, 41, 34,
2ad1516a 54 27, 20, 13, 6, 7, 14, 21, 28,
e0eac44e
FB
55 35, 42, 49, 56, 57, 50, 43, 36,
56 29, 22, 15, 23, 30, 37, 44, 51,
57 58, 59, 52, 45, 38, 31, 39, 46,
58 53, 60, 61, 54, 47, 55, 62, 63
59};
60
2f349de2
MN
61/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
62UINT16 __align8 inv_zigzag_direct16[64];
63
2ad1516a
MN
64const UINT8 ff_alternate_horizontal_scan[64] = {
65 0, 1, 2, 3, 8, 9, 16, 17,
e0eac44e
FB
66 10, 11, 4, 5, 6, 7, 15, 14,
67 13, 12, 19, 18, 24, 25, 32, 33,
68 26, 27, 20, 21, 22, 23, 28, 29,
69 30, 31, 34, 35, 40, 41, 48, 49,
70 42, 43, 36, 37, 38, 39, 44, 45,
71 46, 47, 50, 51, 56, 57, 58, 59,
72 52, 53, 54, 55, 60, 61, 62, 63,
73};
74
2ad1516a
MN
75const UINT8 ff_alternate_vertical_scan[64] = {
76 0, 8, 16, 24, 1, 9, 2, 10,
e0eac44e
FB
77 17, 25, 32, 40, 48, 56, 57, 49,
78 41, 33, 26, 18, 3, 11, 4, 12,
79 19, 27, 34, 42, 50, 58, 35, 43,
80 51, 59, 20, 28, 5, 13, 6, 14,
81 21, 29, 36, 44, 52, 60, 37, 45,
82 53, 61, 22, 30, 7, 15, 23, 31,
83 38, 46, 54, 62, 39, 47, 55, 63,
84};
85
2f349de2
MN
86/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
87UINT32 inverse[256]={
88 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
89 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
90 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
91 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
92 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
93 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
94 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
95 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
96 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
97 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
98 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
99 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
100 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
101 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
102 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
103 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
104 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
105 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
106 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
107 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
108 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
109 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
110 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
111 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
112 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
113 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
114 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
115 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
116 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
117 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
118 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
119 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
120};
121
3aa102be
MN
122int pix_sum_c(UINT8 * pix, int line_size)
123{
124 int s, i, j;
125
126 s = 0;
127 for (i = 0; i < 16; i++) {
128 for (j = 0; j < 16; j += 8) {
129 s += pix[0];
130 s += pix[1];
131 s += pix[2];
132 s += pix[3];
133 s += pix[4];
134 s += pix[5];
135 s += pix[6];
136 s += pix[7];
137 pix += 8;
138 }
139 pix += line_size - 16;
140 }
141 return s;
142}
143
144int pix_norm1_c(UINT8 * pix, int line_size)
145{
146 int s, i, j;
147 UINT32 *sq = squareTbl + 256;
148
149 s = 0;
150 for (i = 0; i < 16; i++) {
151 for (j = 0; j < 16; j += 8) {
152 s += sq[pix[0]];
153 s += sq[pix[1]];
154 s += sq[pix[2]];
155 s += sq[pix[3]];
156 s += sq[pix[4]];
157 s += sq[pix[5]];
158 s += sq[pix[6]];
159 s += sq[pix[7]];
160 pix += 8;
161 }
162 pix += line_size - 16;
163 }
164 return s;
165}
166
167
c13e1abd 168void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
de6d9b64 169{
de6d9b64
FB
170 int i;
171
172 /* read the pixels */
de6d9b64 173 for(i=0;i<8;i++) {
c13e1abd
FH
174 block[0] = pixels[0];
175 block[1] = pixels[1];
176 block[2] = pixels[2];
177 block[3] = pixels[3];
178 block[4] = pixels[4];
179 block[5] = pixels[5];
180 block[6] = pixels[6];
181 block[7] = pixels[7];
182 pixels += line_size;
183 block += 8;
de6d9b64
FB
184 }
185}
186
c13e1abd
FH
187void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1, const UINT8 *s2,
188 int stride){
9dbcbd92
MN
189 int i;
190
191 /* read the pixels */
9dbcbd92 192 for(i=0;i<8;i++) {
c13e1abd
FH
193 block[0] = s1[0] - s2[0];
194 block[1] = s1[1] - s2[1];
195 block[2] = s1[2] - s2[2];
196 block[3] = s1[3] - s2[3];
197 block[4] = s1[4] - s2[4];
198 block[5] = s1[5] - s2[5];
199 block[6] = s1[6] - s2[6];
200 block[7] = s1[7] - s2[7];
9dbcbd92
MN
201 s1 += stride;
202 s2 += stride;
c13e1abd 203 block += 8;
9dbcbd92
MN
204 }
205}
206
207
c13e1abd
FH
208void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
209 int line_size)
de6d9b64 210{
de6d9b64
FB
211 int i;
212 UINT8 *cm = cropTbl + MAX_NEG_CROP;
213
214 /* read the pixels */
de6d9b64 215 for(i=0;i<8;i++) {
c13e1abd
FH
216 pixels[0] = cm[block[0]];
217 pixels[1] = cm[block[1]];
218 pixels[2] = cm[block[2]];
219 pixels[3] = cm[block[3]];
220 pixels[4] = cm[block[4]];
221 pixels[5] = cm[block[5]];
222 pixels[6] = cm[block[6]];
223 pixels[7] = cm[block[7]];
224
225 pixels += line_size;
226 block += 8;
de6d9b64
FB
227 }
228}
229
c13e1abd
FH
230void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
231 int line_size)
de6d9b64 232{
de6d9b64
FB
233 int i;
234 UINT8 *cm = cropTbl + MAX_NEG_CROP;
235
236 /* read the pixels */
de6d9b64 237 for(i=0;i<8;i++) {
c13e1abd
FH
238 pixels[0] = cm[pixels[0] + block[0]];
239 pixels[1] = cm[pixels[1] + block[1]];
240 pixels[2] = cm[pixels[2] + block[2]];
241 pixels[3] = cm[pixels[3] + block[3]];
242 pixels[4] = cm[pixels[4] + block[4]];
243 pixels[5] = cm[pixels[5] + block[5]];
244 pixels[6] = cm[pixels[6] + block[6]];
245 pixels[7] = cm[pixels[7] + block[7]];
246 pixels += line_size;
247 block += 8;
de6d9b64
FB
248 }
249}
59fe111e
MN
250#if 0
251
/* 64 bit variant of the put/avg pixel operations (this is the branch that is
 * currently compiled out by the enclosing "#if 0").
 *
 * PIXOP2(OPNAME, OP) generates functions working on 8 pixel wide rows, one
 * 64 bit word at a time, plus 16 pixel wide versions built with
 * CALL_2X_PIXELS, and the two dispatch tables
 *   OPNAME_pixels_tab[2][4]        (rounding)
 *   OPNAME_no_rnd_pixels_tab[2][4] (not rounding)
 * whose first row holds the 8 wide and second row the 16 wide functions, in
 * the order full-pel, x2, y2, xy2.
 *
 * Byte-wise averages are computed on whole words:
 *   (a|b) - (((a^b)&0xFE..FE)>>1)  == per byte (a+b+1)>>1
 *   (a&b) + (((a^b)&0xFE..FE)>>1)  == per byte (a+b)>>1
 * The xy2 functions handle two rows per loop iteration, i.e. they expect an
 * even h. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16    , OPNAME ## _pixels    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2 , OPNAME ## _pixels_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2 , OPNAME ## _pixels_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2, OPNAME ## _pixels_xy2, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2 , OPNAME ## _no_rnd_pixels_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2 , OPNAME ## _no_rnd_pixels_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2, OPNAME ## _no_rnd_pixels_xy2, 8)\
\
void (*OPNAME ## _pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    {\
        OPNAME ## _pixels,\
        OPNAME ## _pixels_x2,\
        OPNAME ## _pixels_y2,\
        OPNAME ## _pixels_xy2},\
    {\
        OPNAME ## _pixels16,\
        OPNAME ## _pixels16_x2,\
        OPNAME ## _pixels16_y2,\
        OPNAME ## _pixels16_xy2}\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    {\
        OPNAME ## _pixels,\
        OPNAME ## _no_rnd_pixels_x2,\
        OPNAME ## _no_rnd_pixels_y2,\
        OPNAME ## _no_rnd_pixels_xy2},\
    {\
        OPNAME ## _pixels16,\
        OPNAME ## _no_rnd_pixels16_x2,\
        OPNAME ## _no_rnd_pixels16_y2,\
        OPNAME ## _no_rnd_pixels16_xy2}\
};

/* a = byte-wise rounded average of a and b, eight bytes at a time;
 * a is evaluated more than once and must be an lvalue */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
418#else // 64 bit variant
419
/* 32 bit variant of the put/avg pixel operations (the branch that is
 * actually compiled; the 64 bit variant sits in the disabled "#if 0" part).
 *
 * PIXOP2(OPNAME, OP) generates, for 8 pixel wide blocks handled as two
 * 32 bit words per row:
 *   OPNAME_pixels8                     plain copy / OP
 *   OPNAME_[no_rnd_]pixels8_l2         OP with the average of 2 sources
 *   OPNAME_[no_rnd_]pixels8_l4         OP with the average of 4 sources
 *   OPNAME_[no_rnd_]pixels8_x2/y2/xy2  half-pel interpolation
 * the 16 pixel wide counterparts, and the two dispatch tables
 *   OPNAME_pixels_tab[2][4]        (rounding)
 *   OPNAME_no_rnd_pixels_tab[2][4] (not rounding)
 * whose first row holds the 16 wide and second row the 8 wide functions, in
 * the order full-pel, x2, y2, xy2.
 *
 * Byte-wise averages are computed on whole words:
 *   (a|b) - (((a^b)&0xFEFEFEFE)>>1)  == per byte (a+b+1)>>1
 *   (a&b) + (((a^b)&0xFEFEFEFE)>>1)  == per byte (a+b)>>1
 * and the 4 source average is split into the low 2 bits (l0/l1, including
 * the rounding constant 2 or 1 per byte) and the high 6 bits (h0/h1) of
 * every byte so that no carry crosses a byte boundary.
 * The xy2 functions handle two rows per loop iteration, i.e. they expect an
 * even h. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                 int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    /* the 8 wide block is processed as two 4 byte wide columns */\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* back to the top row, 4 bytes further right */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    /* the 8 wide block is processed as two 4 byte wide columns */\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* back to the top row, 4 bytes further right */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16    , OPNAME ## _pixels8    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2 , OPNAME ## _pixels8_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2 , OPNAME ## _pixels8_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2, OPNAME ## _pixels8_xy2, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16    , OPNAME ## _pixels8           , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2 , OPNAME ## _no_rnd_pixels8_x2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2 , OPNAME ## _no_rnd_pixels8_y2 , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2, OPNAME ## _no_rnd_pixels8_xy2, 8)\
\
void (*OPNAME ## _pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    {\
        OPNAME ## _pixels16,\
        OPNAME ## _pixels16_x2,\
        OPNAME ## _pixels16_y2,\
        OPNAME ## _pixels16_xy2},\
    {\
        OPNAME ## _pixels8,\
        OPNAME ## _pixels8_x2,\
        OPNAME ## _pixels8_y2,\
        OPNAME ## _pixels8_xy2},\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    {\
        OPNAME ## _pixels16,\
        OPNAME ## _no_rnd_pixels16_x2,\
        OPNAME ## _no_rnd_pixels16_y2,\
        OPNAME ## _no_rnd_pixels16_xy2},\
    {\
        OPNAME ## _pixels8,\
        OPNAME ## _no_rnd_pixels8_x2,\
        OPNAME ## _no_rnd_pixels8_y2,\
        OPNAME ## _no_rnd_pixels8_xy2},\
};

/* a = byte-wise rounded average of a and b, four bytes at a time;
 * a is evaluated more than once and must be an lvalue */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
689#endif
59fe111e
MN
690#define op_put(a, b) a = b
691
692PIXOP2(avg, op_avg)
693PIXOP2(put, op_put)
694#undef op_avg
695#undef op_put
696
#if 0
/* FIXME this stuff could be removed as it's not really used anymore */
/* Legacy, compiled-out byte-wise implementation of the pixel operations.
 * PIXOP(BTYPE, OPNAME, OP, INCR) generates the full-pel, x2, y2 and xy2
 * functions for 8 pixel wide rows plus the table OPNAME_pixels_tab[4];
 * BTYPE is the destination element type and INCR the destination row step.
 * The loops are do/while, so h must be at least 1. */
#define PIXOP(BTYPE, OPNAME, OP, INCR) \
 \
static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    BTYPE *p; \
    const UINT8 *pix; \
 \
    p = block; \
    pix = pixels; \
    do { \
        OP(p[0], pix[0]); \
        OP(p[1], pix[1]); \
        OP(p[2], pix[2]); \
        OP(p[3], pix[3]); \
        OP(p[4], pix[4]); \
        OP(p[5], pix[5]); \
        OP(p[6], pix[6]); \
        OP(p[7], pix[7]); \
        pix += line_size; \
        p += INCR; \
    } while (--h); \
} \
 \
static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    BTYPE *p; \
    const UINT8 *pix; \
 \
    p = block; \
    pix = pixels; \
    do { \
        OP(p[0], avg2(pix[0], pix[1])); \
        OP(p[1], avg2(pix[1], pix[2])); \
        OP(p[2], avg2(pix[2], pix[3])); \
        OP(p[3], avg2(pix[3], pix[4])); \
        OP(p[4], avg2(pix[4], pix[5])); \
        OP(p[5], avg2(pix[5], pix[6])); \
        OP(p[6], avg2(pix[6], pix[7])); \
        OP(p[7], avg2(pix[7], pix[8])); \
        pix += line_size; \
        p += INCR; \
    } while (--h); \
} \
 \
static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    BTYPE *p; \
    const UINT8 *pix; \
    const UINT8 *pix1; \
 \
    p = block; \
    pix = pixels; \
    pix1 = pixels + line_size; \
    do { \
        OP(p[0], avg2(pix[0], pix1[0])); \
        OP(p[1], avg2(pix[1], pix1[1])); \
        OP(p[2], avg2(pix[2], pix1[2])); \
        OP(p[3], avg2(pix[3], pix1[3])); \
        OP(p[4], avg2(pix[4], pix1[4])); \
        OP(p[5], avg2(pix[5], pix1[5])); \
        OP(p[6], avg2(pix[6], pix1[6])); \
        OP(p[7], avg2(pix[7], pix1[7])); \
        pix += line_size; \
        pix1 += line_size; \
        p += INCR; \
    } while(--h); \
} \
 \
static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    BTYPE *p; \
    const UINT8 *pix; \
    const UINT8 *pix1; \
 \
    p = block; \
    pix = pixels; \
    pix1 = pixels + line_size; \
    do { \
        OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1])); \
        OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2])); \
        OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3])); \
        OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4])); \
        OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5])); \
        OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6])); \
        OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7])); \
        OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8])); \
        pix += line_size; \
        pix1 += line_size; \
        p += INCR; \
    } while(--h); \
} \
 \
void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
    OPNAME ## _pixels, \
    OPNAME ## _pixels_x2, \
    OPNAME ## _pixels_y2, \
    OPNAME ## _pixels_xy2, \
};

/* rounding primitives */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)

#define op_avg(a, b) a = avg2(a, b)
#define op_sub(a, b) a -= b
#define op_put(a, b) a = b

PIXOP(DCTELEM, sub, op_sub, 8)
PIXOP(uint8_t, avg, op_avg, line_size)
PIXOP(uint8_t, put, op_put, line_size)

/* not rounding primitives */
#undef avg2
#undef avg4
#define avg2(a,b) (((a)+(b))>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+1)>>2)

PIXOP(uint8_t, avg_no_rnd, op_avg, line_size)
PIXOP(uint8_t, put_no_rnd, op_put, line_size)
/* motion estimation */

#undef avg2
#undef avg4
#endif
823
/* Rounding averages of 2 and 4 values.  The arguments are parenthesized so
 * that expressions using operators of lower precedence than '+' (e.g. '|'
 * or '>>') can be passed safely; each argument is evaluated exactly once. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
826
073b013d 827
b3184779 828static void gmc1_c(UINT8 *dst, UINT8 *src, int stride, int h, int x16, int y16, int rounder)
44eb4951
MN
829{
830 const int A=(16-x16)*(16-y16);
831 const int B=( x16)*(16-y16);
832 const int C=(16-x16)*( y16);
833 const int D=( x16)*( y16);
834 int i;
44eb4951
MN
835
836 for(i=0; i<h; i++)
837 {
b3184779
MN
838 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
839 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
840 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
841 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
842 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
843 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
844 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
845 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
846 dst+= stride;
847 src+= stride;
44eb4951
MN
848 }
849}
850
073b013d
MN
851static void gmc_c(UINT8 *dst, UINT8 *src, int stride, int h, int ox, int oy,
852 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
853{
854 int y, vx, vy;
855 const int s= 1<<shift;
856
857 width--;
858 height--;
859
860 for(y=0; y<h; y++){
861 int x;
862
863 vx= ox;
864 vy= oy;
865 for(x=0; x<8; x++){ //XXX FIXME optimize
866 int src_x, src_y, frac_x, frac_y, index;
867
868 src_x= vx>>16;
869 src_y= vy>>16;
870 frac_x= src_x&(s-1);
871 frac_y= src_y&(s-1);
872 src_x>>=shift;
873 src_y>>=shift;
874
875 if((unsigned)src_x < width){
876 if((unsigned)src_y < height){
877 index= src_x + src_y*stride;
878 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
879 + src[index +1]* frac_x )*(s-frac_y)
880 + ( src[index+stride ]*(s-frac_x)
881 + src[index+stride+1]* frac_x )* frac_y
882 + r)>>(shift*2);
883 }else{
884 index= src_x + clip(src_y, 0, height)*stride;
885 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
886 + src[index +1]* frac_x )*s
887 + r)>>(shift*2);
888 }
889 }else{
890 if((unsigned)src_y < height){
891 index= clip(src_x, 0, width) + src_y*stride;
892 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
893 + src[index+stride ]* frac_y )*s
894 + r)>>(shift*2);
895 }else{
896 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
897 dst[y*stride + x]= src[index ];
898 }
899 }
900
901 vx+= dxx;
902 vy+= dyx;
903 }
904 ox += dxy;
905 oy += dyy;
906 }
907}
908
b3184779 909static inline void copy_block17(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
44eb4951 910{
44eb4951
MN
911 int i;
912 for(i=0; i<h; i++)
913 {
b3184779
MN
914 ST32(dst , LD32(src ));
915 ST32(dst+4 , LD32(src+4 ));
916 ST32(dst+8 , LD32(src+8 ));
917 ST32(dst+12, LD32(src+12));
918 dst[16]= src[16];
44eb4951
MN
919 dst+=dstStride;
920 src+=srcStride;
921 }
922}
923
b3184779 924static inline void copy_block9(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
44eb4951
MN
925{
926 int i;
b3184779 927 for(i=0; i<h; i++)
44eb4951 928 {
b3184779
MN
929 ST32(dst , LD32(src ));
930 ST32(dst+4 , LD32(src+4 ));
931 dst[8]= src[8];
44eb4951
MN
932 dst+=dstStride;
933 src+=srcStride;
934 }
935}
936
b3184779
MN
/*
 * QPEL_MC(r, OPNAME, RND, OP) generates one complete family of quarter-pel
 * motion compensation routines for 8x8 and 16x16 blocks.
 *
 *   r      - not referenced by the macro body
 *   OPNAME - prefix of the generated functions and of the pixelsN /
 *            pixelsN_l2 / pixelsN_l4 helpers used for the final store
 *   RND    - pasted between "put" and "mpeg4_qpel..." to select which put
 *            variant of the lowpass filters produces intermediate planes
 *   OP     - OP(dst, sum): how one raw filter sum is stored into dst
 *
 * The lowpass filters use the taps (-1, 3, -6, 20, 20, -6, 3, -1); close to
 * the block border the indices are reflected so that only samples 0..N of
 * the (N+1)-sample input row/column are touched.  `cm` must stay named `cm`
 * because the OP macros index it.
 *
 * Generated function names end in mcXY_c, where X is the horizontal and Y the
 * vertical quarter-sample offset (0..3).  OPNAME##qpel_pixels_tab[0] holds
 * the 16x16 and [1] the 8x8 functions, both indexed by X + 4*Y.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
/* ---- lowpass filters, 8 samples wide/high ---- */\
static void OPNAME ## mpeg4_qpel8_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int row;\
    /* one output row of 8 pixels per iteration, reads src[0..8] */\
    for(row=0; row<h; row++){\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst += dstStride;\
        src += srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int col;\
    /* one output column of 8 pixels per iteration, reads 9 source rows */\
    for(col=0; col<w; col++){\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
/* ---- lowpass filters, 16 samples wide/high ---- */\
static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int row;\
    /* one output row of 16 pixels per iteration, reads src[0..16] */\
    for(row=0; row<h; row++){\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst += dstStride;\
        src += srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int col;\
    /* one output column of 16 pixels per iteration, reads 17 source rows */\
    for(col=0; col<w; col++){\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
/* ---- 8x8 entry points; `full` is a 9x9 copy of the source with stride 16 ---- */\
static void OPNAME ## qpel8_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## pixels8(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 half[8*8];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 half[8*8];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 half[8*8];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16, 8);\
}\
\
static void OPNAME ## qpel8_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 half[8*8];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16, 8);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[8*9];\
    UINT8 halfV[8*8];\
    UINT8 halfHV[8*8];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[8*9];\
    UINT8 halfV[8*8];\
    UINT8 halfHV[8*8];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[8*9];\
    UINT8 halfV[8*8];\
    UINT8 halfHV[8*8];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[8*9];\
    UINT8 halfV[8*8];\
    UINT8 halfHV[8*8];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[8*9];\
    UINT8 halfHV[8*8];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[8*9];\
    UINT8 halfHV[8*8];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[8*9];\
    UINT8 halfV[8*8];\
    UINT8 halfHV[8*8];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[16*9];\
    UINT8 halfH[8*9];\
    UINT8 halfV[8*8];\
    UINT8 halfHV[8*8];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16, 8);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[8*9];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8, 8);\
}\
\
/* ---- 16x16 entry points; `full` is a 17x17 copy of the source with stride 24 ---- */\
static void OPNAME ## qpel16_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## pixels16(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 half[16*16];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 half[16*16];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 half[16*16];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24, 16);\
}\
\
static void OPNAME ## qpel16_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 half[16*16];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[16*17];\
    UINT8 halfV[16*16];\
    UINT8 halfHV[16*16];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[16*17];\
    UINT8 halfV[16*16];\
    UINT8 halfHV[16*16];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[16*17];\
    UINT8 halfV[16*16];\
    UINT8 halfHV[16*16];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[16*17];\
    UINT8 halfV[16*16];\
    UINT8 halfHV[16*16];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[16*17];\
    UINT8 halfHV[16*16];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[16*17];\
    UINT8 halfHV[16*16];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[16*17];\
    UINT8 halfV[16*16];\
    UINT8 halfHV[16*16];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 full[24*17];\
    UINT8 halfH[16*17];\
    UINT8 halfV[16*16];\
    UINT8 halfHV[16*16];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
    UINT8 halfH[16*17];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16, 16);\
}\
\
/* dispatch table: [0] = 16x16, [1] = 8x8, entry index = X + 4*Y of mcXY */\
qpel_mc_func OPNAME ## qpel_pixels_tab[2][16]={ \
    {\
        OPNAME ## qpel16_mc00_c, \
        OPNAME ## qpel16_mc10_c, \
        OPNAME ## qpel16_mc20_c, \
        OPNAME ## qpel16_mc30_c, \
        OPNAME ## qpel16_mc01_c, \
        OPNAME ## qpel16_mc11_c, \
        OPNAME ## qpel16_mc21_c, \
        OPNAME ## qpel16_mc31_c, \
        OPNAME ## qpel16_mc02_c, \
        OPNAME ## qpel16_mc12_c, \
        OPNAME ## qpel16_mc22_c, \
        OPNAME ## qpel16_mc32_c, \
        OPNAME ## qpel16_mc03_c, \
        OPNAME ## qpel16_mc13_c, \
        OPNAME ## qpel16_mc23_c, \
        OPNAME ## qpel16_mc33_c, \
    },{\
        OPNAME ## qpel8_mc00_c, \
        OPNAME ## qpel8_mc10_c, \
        OPNAME ## qpel8_mc20_c, \
        OPNAME ## qpel8_mc30_c, \
        OPNAME ## qpel8_mc01_c, \
        OPNAME ## qpel8_mc11_c, \
        OPNAME ## qpel8_mc21_c, \
        OPNAME ## qpel8_mc31_c, \
        OPNAME ## qpel8_mc02_c, \
        OPNAME ## qpel8_mc12_c, \
        OPNAME ## qpel8_mc22_c, \
        OPNAME ## qpel8_mc32_c, \
        OPNAME ## qpel8_mc03_c, \
        OPNAME ## qpel8_mc13_c, \
        OPNAME ## qpel8_mc23_c, \
        OPNAME ## qpel8_mc33_c, \
    }\
};
1341
b3184779
MN
1342#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1343#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1344#define op_put(a, b) a = cm[((b) + 16)>>5]
1345#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1346
1347QPEL_MC(0, put_ , _ , op_put)
1348QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1349QPEL_MC(0, avg_ , _ , op_avg)
1350//QPEL_MC(1, avg_no_rnd , _ , op_avg)
1351#undef op_avg
1352#undef op_avg_no_rnd
1353#undef op_put
1354#undef op_put_no_rnd
44eb4951 1355
ba6802de 1356int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1357{
1358 int s, i;
1359
1360 s = 0;
ba6802de 1361 for(i=0;i<16;i++) {
de6d9b64
FB
1362 s += abs(pix1[0] - pix2[0]);
1363 s += abs(pix1[1] - pix2[1]);
1364 s += abs(pix1[2] - pix2[2]);
1365 s += abs(pix1[3] - pix2[3]);
1366 s += abs(pix1[4] - pix2[4]);
1367 s += abs(pix1[5] - pix2[5]);
1368 s += abs(pix1[6] - pix2[6]);
1369 s += abs(pix1[7] - pix2[7]);
1370 s += abs(pix1[8] - pix2[8]);
1371 s += abs(pix1[9] - pix2[9]);
1372 s += abs(pix1[10] - pix2[10]);
1373 s += abs(pix1[11] - pix2[11]);
1374 s += abs(pix1[12] - pix2[12]);
1375 s += abs(pix1[13] - pix2[13]);
1376 s += abs(pix1[14] - pix2[14]);
1377 s += abs(pix1[15] - pix2[15]);
1378 pix1 += line_size;
1379 pix2 += line_size;
1380 }
1381 return s;
1382}
1383
ba6802de 1384int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1385{
1386 int s, i;
1387
1388 s = 0;
ba6802de 1389 for(i=0;i<16;i++) {
de6d9b64
FB
1390 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1391 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1392 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1393 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1394 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1395 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1396 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1397 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1398 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1399 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1400 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1401 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1402 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1403 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1404 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1405 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1406 pix1 += line_size;
1407 pix2 += line_size;
1408 }
1409 return s;
1410}
1411
ba6802de 1412int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1413{
1414 int s, i;
1415 UINT8 *pix3 = pix2 + line_size;
1416
1417 s = 0;
ba6802de 1418 for(i=0;i<16;i++) {
de6d9b64
FB
1419 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1420 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1421 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1422 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1423 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1424 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1425 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1426 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1427 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1428 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1429 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1430 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1431 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1432 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1433 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1434 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1435 pix1 += line_size;
1436 pix2 += line_size;
1437 pix3 += line_size;
1438 }
1439 return s;
1440}
1441
ba6802de 1442int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1443{
1444 int s, i;
1445 UINT8 *pix3 = pix2 + line_size;
1446
1447 s = 0;
ba6802de 1448 for(i=0;i<16;i++) {
de6d9b64
FB
1449 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1450 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1451 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1452 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1453 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1454 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1455 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1456 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1457 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1458 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1459 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1460 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1461 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1462 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1463 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1464 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1465 pix1 += line_size;
1466 pix2 += line_size;
1467 pix3 += line_size;
1468 }
1469 return s;
1470}
1471
ba6802de
MN
1472int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1473{
1474 int s, i;
1475
1476 s = 0;
1477 for(i=0;i<8;i++) {
1478 s += abs(pix1[0] - pix2[0]);
1479 s += abs(pix1[1] - pix2[1]);
1480 s += abs(pix1[2] - pix2[2]);
1481 s += abs(pix1[3] - pix2[3]);
1482 s += abs(pix1[4] - pix2[4]);
1483 s += abs(pix1[5] - pix2[5]);
1484 s += abs(pix1[6] - pix2[6]);
1485 s += abs(pix1[7] - pix2[7]);
1486 pix1 += line_size;
1487 pix2 += line_size;
1488 }
1489 return s;
1490}
1491
1492int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1493{
1494 int s, i;
1495
1496 s = 0;
1497 for(i=0;i<8;i++) {
1498 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1499 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1500 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1501 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1502 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1503 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1504 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1505 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1506 pix1 += line_size;
1507 pix2 += line_size;
1508 }
1509 return s;
1510}
1511
1512int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1513{
1514 int s, i;
1515 UINT8 *pix3 = pix2 + line_size;
1516
1517 s = 0;
1518 for(i=0;i<8;i++) {
1519 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1520 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1521 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1522 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1523 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1524 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1525 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1526 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1527 pix1 += line_size;
1528 pix2 += line_size;
1529 pix3 += line_size;
1530 }
1531 return s;
1532}
1533
1534int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1535{
1536 int s, i;
1537 UINT8 *pix3 = pix2 + line_size;
1538
1539 s = 0;
1540 for(i=0;i<8;i++) {
1541 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1542 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1543 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1544 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1545 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1546 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1547 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1548 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1549 pix1 += line_size;
1550 pix2 += line_size;
1551 pix3 += line_size;
1552 }
1553 return s;
1554}
1555
7801d21d 1556void ff_block_permute(INT16 *block, UINT8 *permutation, const UINT8 *scantable, int last)
d962f6fd 1557{
7801d21d
MN
1558 int i;
1559 INT16 temp[64];
1560
1561 if(last<=0) return;
1562 if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
d962f6fd 1563
7801d21d
MN
1564 for(i=0; i<=last; i++){
1565 const int j= scantable[i];
1566 temp[j]= block[j];
1567 block[j]=0;
1568 }
1569
1570 for(i=0; i<=last; i++){
1571 const int j= scantable[i];
1572 const int perm_j= permutation[j];
1573 block[perm_j]= temp[j];
1574 }
d962f6fd 1575}
e0eac44e 1576
649c00c9
MN
1577void clear_blocks_c(DCTELEM *blocks)
1578{
1579 memset(blocks, 0, sizeof(DCTELEM)*6*64);
1580}
1581
e0eac44e
FB
1582void dsputil_init(void)
1583{
d2975f8d 1584 int i;
e0eac44e 1585
de6d9b64
FB
1586 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1587 for(i=0;i<MAX_NEG_CROP;i++) {
1588 cropTbl[i] = 0;
1589 cropTbl[i + MAX_NEG_CROP + 256] = 255;
1590 }
1591
1592 for(i=0;i<512;i++) {
1593 squareTbl[i] = (i - 256) * (i - 256);
1594 }
1595
1596 get_pixels = get_pixels_c;
9dbcbd92 1597 diff_pixels = diff_pixels_c;
de6d9b64
FB
1598 put_pixels_clamped = put_pixels_clamped_c;
1599 add_pixels_clamped = add_pixels_clamped_c;
073b013d
MN
1600 ff_gmc1= gmc1_c;
1601 ff_gmc= gmc_c;
649c00c9 1602 clear_blocks= clear_blocks_c;
3aa102be
MN
1603 pix_sum= pix_sum_c;
1604 pix_norm1= pix_norm1_c;
de6d9b64 1605
ba6802de
MN
1606 pix_abs16x16 = pix_abs16x16_c;
1607 pix_abs16x16_x2 = pix_abs16x16_x2_c;
1608 pix_abs16x16_y2 = pix_abs16x16_y2_c;
de6d9b64 1609 pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
ba6802de
MN
1610 pix_abs8x8 = pix_abs8x8_c;
1611 pix_abs8x8_x2 = pix_abs8x8_x2_c;
1612 pix_abs8x8_y2 = pix_abs8x8_y2_c;
1613 pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
de6d9b64 1614
980fc7b8 1615#ifdef HAVE_MMX
de6d9b64
FB
1616 dsputil_init_mmx();
1617#endif
3d03c0a2
FB
1618#ifdef ARCH_ARMV4L
1619 dsputil_init_armv4l();
1620#endif
c34270f5
FB
1621#ifdef HAVE_MLIB
1622 dsputil_init_mlib();
c34270f5 1623#endif
1e98dffb
NK
1624#ifdef ARCH_ALPHA
1625 dsputil_init_alpha();
1e98dffb 1626#endif
59925ef2 1627#ifdef ARCH_POWERPC
ab6c65f6 1628 dsputil_init_ppc();
a43bd1d7 1629#endif
d46aba26
LS
1630#ifdef HAVE_MMI
1631 dsputil_init_mmi();
d46aba26 1632#endif
c34270f5 1633
2ad1516a 1634 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
de6d9b64 1635}
43f1708f 1636
57060b1e
FB
1637/* remove any non bit exact operation (testing purpose) */
1638void avcodec_set_bit_exact(void)
1639{
5596c60c 1640 ff_bit_exact=1;
57060b1e
FB
1641#ifdef HAVE_MMX
1642 dsputil_set_bit_exact_mmx();
1643#endif
1644}
1645
43f1708f
J
1646void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1647 int orig_linesize[3], int coded_linesize,
1648 AVCodecContext *avctx)
1649{
1650 int quad, diff, x, y;
1651 UINT8 *orig, *coded;
1652 UINT32 *sq = squareTbl + 256;
1653
1654 quad = 0;
1655 diff = 0;
1656
1657 /* Luminance */
1658 orig = orig_image[0];
1659 coded = coded_image[0];
1660
1661 for (y=0;y<avctx->height;y++) {
1662 for (x=0;x<avctx->width;x++) {
1663 diff = *(orig + x) - *(coded + x);
1664 quad += sq[diff];
1665 }
1666 orig += orig_linesize[0];
1667 coded += coded_linesize;
1668 }
1669
1670 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1671
1672 if (avctx->psnr_y) {
1673 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1674 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1675 } else
1676 avctx->psnr_y = 99.99;
1677}
1678