fixing alt_scan for the first frame (variable was reset)
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
1/*
2 * DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
de6d9b64 4 *
ff4ec49e
FB
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
de6d9b64 9 *
ff4ec49e 10 * This library is distributed in the hope that it will be useful,
de6d9b64 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
de6d9b64 14 *
ff4ec49e
FB
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
7ff037e9 18 *
59fe111e 19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
de6d9b64 20 */
de6d9b64
FB
21#include "avcodec.h"
22#include "dsputil.h"
23
/* Entry points for the basic block operations. Several of them have the same
 * signature as a *_c implementation further down (get_pixels_c,
 * diff_pixels_c, put_pixels_clamped_c, add_pixels_clamped_c, gmc1_c,
 * pix_sum_c, pix_norm1_c). Where they get assigned is not visible here. */
de6d9b64 24void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
9dbcbd92 25void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
de6d9b64
FB
26void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
27void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
44eb4951 28void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
649c00c9 29void (*clear_blocks)(DCTELEM *blocks);
3aa102be
MN
30int (*pix_sum)(UINT8 * pix, int line_size);
31int (*pix_norm1)(UINT8 * pix, int line_size);
de6d9b64
FB
32
/* 16x16 comparison functions; the _x2/_y2/_xy2 suffixes presumably select the
 * half-pel variants (TODO confirm against op_pixels_abs_func users) */
33op_pixels_abs_func pix_abs16x16;
34op_pixels_abs_func pix_abs16x16_x2;
35op_pixels_abs_func pix_abs16x16_y2;
36op_pixels_abs_func pix_abs16x16_xy2;
37
ba6802de
MN
/* same set for 8x8 blocks */
38op_pixels_abs_func pix_abs8x8;
39op_pixels_abs_func pix_abs8x8_x2;
40op_pixels_abs_func pix_abs8x8_y2;
41op_pixels_abs_func pix_abs8x8_xy2;
42
5596c60c
MN
/* global flag, off by default; its consumers are outside this part of the file */
43int ff_bit_exact=0;
44
/* lookup table used through "cropTbl + MAX_NEG_CROP" by the *_clamped
 * functions, so it can be indexed with values below 0 and above 255;
 * it is filled in elsewhere */
0cfa9713 45UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
de6d9b64
FB
/* lookup table used through "squareTbl + 256" by pix_norm1_c;
 * it is filled in elsewhere */
46UINT32 squareTbl[512];
47
2ad1516a
MN
48const UINT8 ff_zigzag_direct[64] = {
49 0, 1, 8, 16, 9, 2, 3, 10,
50 17, 24, 32, 25, 18, 11, 4, 5,
e0eac44e 51 12, 19, 26, 33, 40, 48, 41, 34,
2ad1516a 52 27, 20, 13, 6, 7, 14, 21, 28,
e0eac44e
FB
53 35, 42, 49, 56, 57, 50, 43, 36,
54 29, 22, 15, 23, 30, 37, 44, 51,
55 58, 59, 52, 45, 38, 31, 39, 46,
56 53, 60, 61, 54, 47, 55, 62, 63
57};
58
2f349de2
MN
59/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
60UINT16 __align8 inv_zigzag_direct16[64];
61
2ad1516a
MN
62const UINT8 ff_alternate_horizontal_scan[64] = {
63 0, 1, 2, 3, 8, 9, 16, 17,
e0eac44e
FB
64 10, 11, 4, 5, 6, 7, 15, 14,
65 13, 12, 19, 18, 24, 25, 32, 33,
66 26, 27, 20, 21, 22, 23, 28, 29,
67 30, 31, 34, 35, 40, 41, 48, 49,
68 42, 43, 36, 37, 38, 39, 44, 45,
69 46, 47, 50, 51, 56, 57, 58, 59,
70 52, 53, 54, 55, 60, 61, 62, 63,
71};
72
2ad1516a
MN
73const UINT8 ff_alternate_vertical_scan[64] = {
74 0, 8, 16, 24, 1, 9, 2, 10,
e0eac44e
FB
75 17, 25, 32, 40, 48, 56, 57, 49,
76 41, 33, 26, 18, 3, 11, 4, 12,
77 19, 27, 34, 42, 50, 58, 35, 43,
78 51, 59, 20, 28, 5, 13, 6, 14,
79 21, 29, 36, 44, 52, 60, 37, 45,
80 53, 61, 22, 30, 7, 15, 23, 31,
81 38, 46, 54, 62, 39, 47, 55, 63,
82};
83
2f349de2
MN
84/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Reciprocal table that turns a division by b into a multiplication and a
 * shift. The entries for b >= 2 appear to be 2^32/b rounded up (for example
 * inverse[2] == 2^31 and inverse[3] == 1431655766); entries 0 and 1 lie
 * outside the range for which the identity above is stated.
 * The product a*inverse[b] does not fit in 32 bits (a may be as large as
 * 65536 and inverse[2] is 2^31), so it has to be evaluated in a 64 bit type
 * before shifting. */
85UINT32 inverse[256]={
86 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
87 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
88 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
89 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
90 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
91 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
92 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
93 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
94 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
95 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
96 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
97 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
98 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
99 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
100 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
101 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
102 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
103 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
104 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
105 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
106 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
107 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
108 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
109 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
110 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
111 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
112 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
113 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
114 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
115 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
116 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
117 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
118};
119
3aa102be
MN
120int pix_sum_c(UINT8 * pix, int line_size)
121{
122 int s, i, j;
123
124 s = 0;
125 for (i = 0; i < 16; i++) {
126 for (j = 0; j < 16; j += 8) {
127 s += pix[0];
128 s += pix[1];
129 s += pix[2];
130 s += pix[3];
131 s += pix[4];
132 s += pix[5];
133 s += pix[6];
134 s += pix[7];
135 pix += 8;
136 }
137 pix += line_size - 16;
138 }
139 return s;
140}
141
142int pix_norm1_c(UINT8 * pix, int line_size)
143{
144 int s, i, j;
145 UINT32 *sq = squareTbl + 256;
146
147 s = 0;
148 for (i = 0; i < 16; i++) {
149 for (j = 0; j < 16; j += 8) {
150 s += sq[pix[0]];
151 s += sq[pix[1]];
152 s += sq[pix[2]];
153 s += sq[pix[3]];
154 s += sq[pix[4]];
155 s += sq[pix[5]];
156 s += sq[pix[6]];
157 s += sq[pix[7]];
158 pix += 8;
159 }
160 pix += line_size - 16;
161 }
162 return s;
163}
164
165
c13e1abd 166void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
de6d9b64 167{
de6d9b64
FB
168 int i;
169
170 /* read the pixels */
de6d9b64 171 for(i=0;i<8;i++) {
c13e1abd
FH
172 block[0] = pixels[0];
173 block[1] = pixels[1];
174 block[2] = pixels[2];
175 block[3] = pixels[3];
176 block[4] = pixels[4];
177 block[5] = pixels[5];
178 block[6] = pixels[6];
179 block[7] = pixels[7];
180 pixels += line_size;
181 block += 8;
de6d9b64
FB
182 }
183}
184
c13e1abd
FH
185void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1, const UINT8 *s2,
186 int stride){
9dbcbd92
MN
187 int i;
188
189 /* read the pixels */
9dbcbd92 190 for(i=0;i<8;i++) {
c13e1abd
FH
191 block[0] = s1[0] - s2[0];
192 block[1] = s1[1] - s2[1];
193 block[2] = s1[2] - s2[2];
194 block[3] = s1[3] - s2[3];
195 block[4] = s1[4] - s2[4];
196 block[5] = s1[5] - s2[5];
197 block[6] = s1[6] - s2[6];
198 block[7] = s1[7] - s2[7];
9dbcbd92
MN
199 s1 += stride;
200 s2 += stride;
c13e1abd 201 block += 8;
9dbcbd92
MN
202 }
203}
204
205
c13e1abd
FH
206void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
207 int line_size)
de6d9b64 208{
de6d9b64
FB
209 int i;
210 UINT8 *cm = cropTbl + MAX_NEG_CROP;
211
212 /* read the pixels */
de6d9b64 213 for(i=0;i<8;i++) {
c13e1abd
FH
214 pixels[0] = cm[block[0]];
215 pixels[1] = cm[block[1]];
216 pixels[2] = cm[block[2]];
217 pixels[3] = cm[block[3]];
218 pixels[4] = cm[block[4]];
219 pixels[5] = cm[block[5]];
220 pixels[6] = cm[block[6]];
221 pixels[7] = cm[block[7]];
222
223 pixels += line_size;
224 block += 8;
de6d9b64
FB
225 }
226}
227
c13e1abd
FH
228void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
229 int line_size)
de6d9b64 230{
de6d9b64
FB
231 int i;
232 UINT8 *cm = cropTbl + MAX_NEG_CROP;
233
234 /* read the pixels */
de6d9b64 235 for(i=0;i<8;i++) {
c13e1abd
FH
236 pixels[0] = cm[pixels[0] + block[0]];
237 pixels[1] = cm[pixels[1] + block[1]];
238 pixels[2] = cm[pixels[2] + block[2]];
239 pixels[3] = cm[pixels[3] + block[3]];
240 pixels[4] = cm[pixels[4] + block[4]];
241 pixels[5] = cm[pixels[5] + block[5]];
242 pixels[6] = cm[pixels[6] + block[6]];
243 pixels[7] = cm[pixels[7] + block[7]];
244 pixels += line_size;
245 block += 8;
de6d9b64
FB
246 }
247}
59fe111e
MN
248#if 0
249
/*
 * PIXOP2(OPNAME, OP), 64 bit variant. It sits in the "#if 0" branch and is
 * therefore not compiled.
 *
 * Expands to static routines named OPNAME ## _..., each handling rows of
 * 8 pixels as one 64 bit word:
 *   _pixels                  OP(dst, src) for h rows
 *   _pixels_x2 / _y2         byte-wise average with the word one pixel to
 *                            the right / one line below, rounding up:
 *                            (a|b) - (((a^b)&0xFE..FE)>>1)
 *   _no_rnd_pixels_x2 / _y2  same averages rounding down:
 *                            (a&b) + (((a^b)&0xFE..FE)>>1)
 *   _pixels_xy2              four-neighbour average; the low two bits and the
 *                            high six bits of every byte are summed
 *                            separately so that no carry crosses a byte
 *                            boundary (bias 0x02 per byte, 0x01 in the
 *                            _no_rnd form); two rows are written per loop
 *                            step, so an odd h processes h+1 rows
 * followed by the *16 wrappers made with CALL_2X_PIXELS and by the tables
 * OPNAME_pixels_tab / OPNAME_no_rnd_pixels_tab, where [0] holds the 8 wide
 * and [1] the 16 wide functions (second index: 0 copy, 1 x2, 2 y2, 3 xy2).
 *
 * OP(a, b) must be a statement-like macro taking an lvalue and a value.
 * LD64 and CALL_2X_PIXELS are defined outside this part of the file.
 */
250#define PIXOP2(OPNAME, OP) \
b3184779 251static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
252{\
253 int i;\
254 for(i=0; i<h; i++){\
255 OP(*((uint64_t*)block), LD64(pixels));\
256 pixels+=line_size;\
257 block +=line_size;\
258 }\
259}\
260\
b3184779 261static void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
262{\
263 int i;\
264 for(i=0; i<h; i++){\
265 const uint64_t a= LD64(pixels );\
266 const uint64_t b= LD64(pixels+1);\
267 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
268 pixels+=line_size;\
269 block +=line_size;\
270 }\
271}\
272\
b3184779 273static void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
274{\
275 int i;\
276 for(i=0; i<h; i++){\
277 const uint64_t a= LD64(pixels );\
278 const uint64_t b= LD64(pixels+1);\
279 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
280 pixels+=line_size;\
281 block +=line_size;\
282 }\
283}\
284\
b3184779 285static void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
286{\
287 int i;\
288 for(i=0; i<h; i++){\
289 const uint64_t a= LD64(pixels );\
290 const uint64_t b= LD64(pixels+line_size);\
291 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
292 pixels+=line_size;\
293 block +=line_size;\
294 }\
295}\
296\
b3184779 297static void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
298{\
299 int i;\
300 for(i=0; i<h; i++){\
301 const uint64_t a= LD64(pixels );\
302 const uint64_t b= LD64(pixels+line_size);\
303 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
304 pixels+=line_size;\
305 block +=line_size;\
306 }\
307}\
308\
b3184779 309static void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
310{\
311 int i;\
312 const uint64_t a= LD64(pixels );\
313 const uint64_t b= LD64(pixels+1);\
314 uint64_t l0= (a&0x0303030303030303ULL)\
315 + (b&0x0303030303030303ULL)\
316 + 0x0202020202020202ULL;\
317 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
318 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
319 uint64_t l1,h1;\
320\
321 pixels+=line_size;\
322 for(i=0; i<h; i+=2){\
323 uint64_t a= LD64(pixels );\
324 uint64_t b= LD64(pixels+1);\
325 l1= (a&0x0303030303030303ULL)\
326 + (b&0x0303030303030303ULL);\
327 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
328 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
329 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
330 pixels+=line_size;\
331 block +=line_size;\
332 a= LD64(pixels );\
333 b= LD64(pixels+1);\
334 l0= (a&0x0303030303030303ULL)\
335 + (b&0x0303030303030303ULL)\
336 + 0x0202020202020202ULL;\
337 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
338 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
339 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
340 pixels+=line_size;\
341 block +=line_size;\
342 }\
343}\
344\
b3184779 345static void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
346{\
347 int i;\
348 const uint64_t a= LD64(pixels );\
349 const uint64_t b= LD64(pixels+1);\
350 uint64_t l0= (a&0x0303030303030303ULL)\
351 + (b&0x0303030303030303ULL)\
352 + 0x0101010101010101ULL;\
353 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
354 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
355 uint64_t l1,h1;\
356\
357 pixels+=line_size;\
358 for(i=0; i<h; i+=2){\
359 uint64_t a= LD64(pixels );\
360 uint64_t b= LD64(pixels+1);\
361 l1= (a&0x0303030303030303ULL)\
362 + (b&0x0303030303030303ULL);\
363 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
364 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
365 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
366 pixels+=line_size;\
367 block +=line_size;\
368 a= LD64(pixels );\
369 b= LD64(pixels+1);\
370 l0= (a&0x0303030303030303ULL)\
371 + (b&0x0303030303030303ULL)\
372 + 0x0101010101010101ULL;\
373 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
374 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
375 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
376 pixels+=line_size;\
377 block +=line_size;\
378 }\
379}\
380\
b3184779
MN
381CALL_2X_PIXELS(OPNAME ## _pixels16 , OPNAME ## _pixels , 8)\
382CALL_2X_PIXELS(OPNAME ## _pixels16_x2 , OPNAME ## _pixels_x2 , 8)\
383CALL_2X_PIXELS(OPNAME ## _pixels16_y2 , OPNAME ## _pixels_y2 , 8)\
384CALL_2X_PIXELS(OPNAME ## _pixels16_xy2, OPNAME ## _pixels_xy2, 8)\
385CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2 , OPNAME ## _no_rnd_pixels_x2 , 8)\
386CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2 , OPNAME ## _no_rnd_pixels_y2 , 8)\
387CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2, OPNAME ## _no_rnd_pixels_xy2, 8)\
388\
389void (*OPNAME ## _pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
390 {\
391 OPNAME ## _pixels,\
392 OPNAME ## _pixels_x2,\
393 OPNAME ## _pixels_y2,\
394 OPNAME ## _pixels_xy2},\
395 {\
396 OPNAME ## _pixels16,\
397 OPNAME ## _pixels16_x2,\
398 OPNAME ## _pixels16_y2,\
399 OPNAME ## _pixels16_xy2}\
59fe111e
MN
400};\
401\
b3184779
MN
402void (*OPNAME ## _no_rnd_pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
403 {\
404 OPNAME ## _pixels,\
405 OPNAME ## _no_rnd_pixels_x2,\
406 OPNAME ## _no_rnd_pixels_y2,\
407 OPNAME ## _no_rnd_pixels_xy2},\
408 {\
409 OPNAME ## _pixels16,\
410 OPNAME ## _no_rnd_pixels16_x2,\
411 OPNAME ## _no_rnd_pixels16_y2,\
412 OPNAME ## _no_rnd_pixels16_xy2}\
59fe111e
MN
413};
414
/* OP for the "avg" family: byte-wise average of the destination and the new
 * value, rounding up, on eight bytes at once */
415#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
416#else // 64 bit variant
417
/*
 * PIXOP2(OPNAME, OP), 32 bit variant. This is the one that is compiled,
 * since it is the #else branch of the "#if 0" above.
 *
 * Expands to static routines named OPNAME ## _..., each handling rows of
 * 8 pixels as two 32 bit words:
 *   _pixels8                 OP(dst, src) for h rows
 *   _no_rnd_pixels8          forwards to _pixels8
 *   _pixels8_l2              byte-wise average of two sources, rounding up:
 *                            (a|b) - (((a^b)&0xFEFEFEFE)>>1)
 *   _no_rnd_pixels8_l2       same, rounding down:
 *                            (a&b) + (((a^b)&0xFEFEFEFE)>>1)
 *   _pixels8_l4              byte-wise average of four sources; the low two
 *                            bits and the high six bits of every byte are
 *                            summed separately so that no carry crosses a
 *                            byte boundary (bias 0x02 per byte, 0x01 in the
 *                            _no_rnd form)
 *   _pixels8_x2 / _y2        _l2 of the block and the block displaced by one
 *                            pixel / one line
 *   _pixels8_xy2             four-neighbour average, computed as two columns
 *                            of 4 bytes (j loop); two rows are written per
 *                            i step, so an odd h processes h+1 rows
 *   _pixels16_l2 / _l4       two calls of the 8 wide routine, 8 pixels apart
 * followed by the *16 wrappers made with CALL_2X_PIXELS and by the tables
 * OPNAME_pixels_tab / OPNAME_no_rnd_pixels_tab, where [0] holds the 16 wide
 * and [1] the 8 wide functions (second index: 0 copy, 1 x2, 2 y2, 3 xy2).
 * Slot 0 of the no_rnd table is the plain copy routine.
 *
 * OP(a, b) must be a statement-like macro taking an lvalue and a value.
 * LD32 and CALL_2X_PIXELS are defined outside this part of the file.
 */
418#define PIXOP2(OPNAME, OP) \
b3184779 419static void OPNAME ## _pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
59fe111e
MN
420 int i;\
421 for(i=0; i<h; i++){\
422 OP(*((uint32_t*)(block )), LD32(pixels ));\
423 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
424 pixels+=line_size;\
425 block +=line_size;\
426 }\
427}\
b3184779
MN
428static inline void OPNAME ## _no_rnd_pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
429 OPNAME ## _pixels8(block, pixels, line_size, h);\
430}\
59fe111e 431\
b3184779
MN
432static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
433 int src_stride1, int src_stride2, int h){\
59fe111e
MN
434 int i;\
435 for(i=0; i<h; i++){\
b3184779
MN
436 uint32_t a,b;\
437 a= LD32(&src1[i*src_stride1 ]);\
438 b= LD32(&src2[i*src_stride2 ]);\
439 OP(*((uint32_t*)&dst[i*dst_stride ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
440 a= LD32(&src1[i*src_stride1+4]);\
441 b= LD32(&src2[i*src_stride2+4]);\
442 OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
59fe111e
MN
443 }\
444}\
445\
b3184779
MN
446static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
447 int src_stride1, int src_stride2, int h){\
59fe111e
MN
448 int i;\
449 for(i=0; i<h; i++){\
b3184779
MN
450 uint32_t a,b;\
451 a= LD32(&src1[i*src_stride1 ]);\
452 b= LD32(&src2[i*src_stride2 ]);\
453 OP(*((uint32_t*)&dst[i*dst_stride ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
454 a= LD32(&src1[i*src_stride1+4]);\
455 b= LD32(&src2[i*src_stride2+4]);\
456 OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
59fe111e
MN
457 }\
458}\
459\
b3184779
MN
460static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
461 int src_stride1, int src_stride2, int h){\
462 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
463 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
464}\
465\
466static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
467 int src_stride1, int src_stride2, int h){\
468 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
469 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
470}\
471\
472static inline void OPNAME ## _no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
473 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
474}\
475\
476static inline void OPNAME ## _pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
477 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
478}\
479\
480static inline void OPNAME ## _no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
481 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
482}\
483\
484static inline void OPNAME ## _pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
485 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
486}\
487\
488static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
489 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
490 int i;\
491 for(i=0; i<h; i++){\
b3184779
MN
492 uint32_t a, b, c, d, l0, l1, h0, h1;\
493 a= LD32(&src1[i*src_stride1]);\
494 b= LD32(&src2[i*src_stride2]);\
495 c= LD32(&src3[i*src_stride3]);\
496 d= LD32(&src4[i*src_stride4]);\
497 l0= (a&0x03030303UL)\
498 + (b&0x03030303UL)\
499 + 0x02020202UL;\
500 h0= ((a&0xFCFCFCFCUL)>>2)\
501 + ((b&0xFCFCFCFCUL)>>2);\
502 l1= (c&0x03030303UL)\
503 + (d&0x03030303UL);\
504 h1= ((c&0xFCFCFCFCUL)>>2)\
505 + ((d&0xFCFCFCFCUL)>>2);\
506 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
507 a= LD32(&src1[i*src_stride1+4]);\
508 b= LD32(&src2[i*src_stride2+4]);\
509 c= LD32(&src3[i*src_stride3+4]);\
510 d= LD32(&src4[i*src_stride4+4]);\
511 l0= (a&0x03030303UL)\
512 + (b&0x03030303UL)\
513 + 0x02020202UL;\
514 h0= ((a&0xFCFCFCFCUL)>>2)\
515 + ((b&0xFCFCFCFCUL)>>2);\
516 l1= (c&0x03030303UL)\
517 + (d&0x03030303UL);\
518 h1= ((c&0xFCFCFCFCUL)>>2)\
519 + ((d&0xFCFCFCFCUL)>>2);\
520 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
521 }\
522}\
b3184779
MN
523static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
524 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
525 int i;\
526 for(i=0; i<h; i++){\
b3184779
MN
527 uint32_t a, b, c, d, l0, l1, h0, h1;\
528 a= LD32(&src1[i*src_stride1]);\
529 b= LD32(&src2[i*src_stride2]);\
530 c= LD32(&src3[i*src_stride3]);\
531 d= LD32(&src4[i*src_stride4]);\
532 l0= (a&0x03030303UL)\
533 + (b&0x03030303UL)\
534 + 0x01010101UL;\
535 h0= ((a&0xFCFCFCFCUL)>>2)\
536 + ((b&0xFCFCFCFCUL)>>2);\
537 l1= (c&0x03030303UL)\
538 + (d&0x03030303UL);\
539 h1= ((c&0xFCFCFCFCUL)>>2)\
540 + ((d&0xFCFCFCFCUL)>>2);\
541 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
542 a= LD32(&src1[i*src_stride1+4]);\
543 b= LD32(&src2[i*src_stride2+4]);\
544 c= LD32(&src3[i*src_stride3+4]);\
545 d= LD32(&src4[i*src_stride4+4]);\
546 l0= (a&0x03030303UL)\
547 + (b&0x03030303UL)\
548 + 0x01010101UL;\
549 h0= ((a&0xFCFCFCFCUL)>>2)\
550 + ((b&0xFCFCFCFCUL)>>2);\
551 l1= (c&0x03030303UL)\
552 + (d&0x03030303UL);\
553 h1= ((c&0xFCFCFCFCUL)>>2)\
554 + ((d&0xFCFCFCFCUL)>>2);\
555 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
556 }\
557}\
b3184779
MN
558static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
559 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
560 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
561 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
562}\
563static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
564 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
565 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
566 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
567}\
59fe111e 568\
b3184779 569static inline void OPNAME ## _pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
570{\
571 int j;\
572 for(j=0; j<2; j++){\
573 int i;\
574 const uint32_t a= LD32(pixels );\
575 const uint32_t b= LD32(pixels+1);\
576 uint32_t l0= (a&0x03030303UL)\
577 + (b&0x03030303UL)\
578 + 0x02020202UL;\
579 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
580 + ((b&0xFCFCFCFCUL)>>2);\
581 uint32_t l1,h1;\
582\
583 pixels+=line_size;\
584 for(i=0; i<h; i+=2){\
585 uint32_t a= LD32(pixels );\
586 uint32_t b= LD32(pixels+1);\
587 l1= (a&0x03030303UL)\
588 + (b&0x03030303UL);\
589 h1= ((a&0xFCFCFCFCUL)>>2)\
590 + ((b&0xFCFCFCFCUL)>>2);\
591 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
592 pixels+=line_size;\
593 block +=line_size;\
594 a= LD32(pixels );\
595 b= LD32(pixels+1);\
596 l0= (a&0x03030303UL)\
597 + (b&0x03030303UL)\
598 + 0x02020202UL;\
599 h0= ((a&0xFCFCFCFCUL)>>2)\
600 + ((b&0xFCFCFCFCUL)>>2);\
601 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
602 pixels+=line_size;\
603 block +=line_size;\
604 }\
605 pixels+=4-line_size*(h+1);\
606 block +=4-line_size*h;\
607 }\
608}\
609\
b3184779 610static inline void OPNAME ## _no_rnd_pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
611{\
612 int j;\
613 for(j=0; j<2; j++){\
614 int i;\
615 const uint32_t a= LD32(pixels );\
616 const uint32_t b= LD32(pixels+1);\
617 uint32_t l0= (a&0x03030303UL)\
618 + (b&0x03030303UL)\
619 + 0x01010101UL;\
620 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
621 + ((b&0xFCFCFCFCUL)>>2);\
622 uint32_t l1,h1;\
623\
624 pixels+=line_size;\
625 for(i=0; i<h; i+=2){\
626 uint32_t a= LD32(pixels );\
627 uint32_t b= LD32(pixels+1);\
628 l1= (a&0x03030303UL)\
629 + (b&0x03030303UL);\
630 h1= ((a&0xFCFCFCFCUL)>>2)\
631 + ((b&0xFCFCFCFCUL)>>2);\
632 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
633 pixels+=line_size;\
634 block +=line_size;\
635 a= LD32(pixels );\
636 b= LD32(pixels+1);\
637 l0= (a&0x03030303UL)\
638 + (b&0x03030303UL)\
639 + 0x01010101UL;\
640 h0= ((a&0xFCFCFCFCUL)>>2)\
641 + ((b&0xFCFCFCFCUL)>>2);\
642 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
643 pixels+=line_size;\
644 block +=line_size;\
645 }\
646 pixels+=4-line_size*(h+1);\
647 block +=4-line_size*h;\
648 }\
649}\
650\
b3184779
MN
651CALL_2X_PIXELS(OPNAME ## _pixels16 , OPNAME ## _pixels8 , 8)\
652CALL_2X_PIXELS(OPNAME ## _pixels16_x2 , OPNAME ## _pixels8_x2 , 8)\
653CALL_2X_PIXELS(OPNAME ## _pixels16_y2 , OPNAME ## _pixels8_y2 , 8)\
654CALL_2X_PIXELS(OPNAME ## _pixels16_xy2, OPNAME ## _pixels8_xy2, 8)\
655CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16 , OPNAME ## _pixels8 , 8)\
656CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2 , OPNAME ## _no_rnd_pixels8_x2 , 8)\
657CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2 , OPNAME ## _no_rnd_pixels8_y2 , 8)\
658CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2, OPNAME ## _no_rnd_pixels8_xy2, 8)\
659\
660void (*OPNAME ## _pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
661 {\
662 OPNAME ## _pixels16,\
663 OPNAME ## _pixels16_x2,\
664 OPNAME ## _pixels16_y2,\
665 OPNAME ## _pixels16_xy2},\
666 {\
667 OPNAME ## _pixels8,\
668 OPNAME ## _pixels8_x2,\
669 OPNAME ## _pixels8_y2,\
670 OPNAME ## _pixels8_xy2},\
59fe111e
MN
671};\
672\
b3184779
MN
673void (*OPNAME ## _no_rnd_pixels_tab[2][4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
674 {\
675 OPNAME ## _pixels16,\
676 OPNAME ## _no_rnd_pixels16_x2,\
677 OPNAME ## _no_rnd_pixels16_y2,\
678 OPNAME ## _no_rnd_pixels16_xy2},\
679 {\
680 OPNAME ## _pixels8,\
681 OPNAME ## _no_rnd_pixels8_x2,\
682 OPNAME ## _no_rnd_pixels8_y2,\
683 OPNAME ## _no_rnd_pixels8_xy2},\
59fe111e 684};
b3184779 685
59fe111e
MN
/* OP for the "avg" family: byte-wise average of the destination and the new
 * value, rounding up, on four bytes at once */
686#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
687#endif
59fe111e
MN
/* OP for the "put" family: plain store of the computed value */
688#define op_put(a, b) a = b
689
/* expand PIXOP2 twice: once for the avg_* and once for the put_* functions
 * and their lookup tables */
690PIXOP2(avg, op_avg)
691PIXOP2(put, op_put)
/* the two OP helpers are dropped again after the expansions above */
692#undef op_avg
693#undef op_put
694
57060b1e 695#if 0
59fe111e 696/* FIXME this stuff could be removed as it is not really used anymore */
/*
 * PIXOP(BTYPE, OPNAME, OP, INCR): older byte-at-a-time generator, disabled by
 * the surrounding "#if 0".
 *
 * BTYPE  element type of the destination block
 * OPNAME prefix of the generated functions
 * OP     statement-like macro OP(dst, value)
 * INCR   amount added to the destination pointer after each row
 *
 * Generates OPNAME_pixels (copy), _pixels_x2 / _pixels_y2 (avg2 with the
 * pixel to the right / in the row below), _pixels_xy2 (avg4 of the four
 * neighbours) and the table OPNAME_pixels_tab[4] in that order. The x2 and
 * xy2 forms read a 9th pixel per row. All loops are do/while, so h must be
 * at least 1. avg2 and avg4 are looked up when the macro is expanded, which
 * is why the section redefines them between the rounding and the
 * non-rounding instantiations.
 */
de6d9b64
FB
697#define PIXOP(BTYPE, OPNAME, OP, INCR) \
698 \
699static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
700{ \
701 BTYPE *p; \
702 const UINT8 *pix; \
703 \
704 p = block; \
705 pix = pixels; \
706 do { \
707 OP(p[0], pix[0]); \
708 OP(p[1], pix[1]); \
709 OP(p[2], pix[2]); \
710 OP(p[3], pix[3]); \
711 OP(p[4], pix[4]); \
712 OP(p[5], pix[5]); \
713 OP(p[6], pix[6]); \
714 OP(p[7], pix[7]); \
715 pix += line_size; \
716 p += INCR; \
717 } while (--h);; \
718} \
719 \
720static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
721{ \
722 BTYPE *p; \
723 const UINT8 *pix; \
724 \
725 p = block; \
726 pix = pixels; \
727 do { \
728 OP(p[0], avg2(pix[0], pix[1])); \
729 OP(p[1], avg2(pix[1], pix[2])); \
730 OP(p[2], avg2(pix[2], pix[3])); \
731 OP(p[3], avg2(pix[3], pix[4])); \
732 OP(p[4], avg2(pix[4], pix[5])); \
733 OP(p[5], avg2(pix[5], pix[6])); \
734 OP(p[6], avg2(pix[6], pix[7])); \
735 OP(p[7], avg2(pix[7], pix[8])); \
736 pix += line_size; \
737 p += INCR; \
738 } while (--h); \
739} \
740 \
741static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
742{ \
743 BTYPE *p; \
744 const UINT8 *pix; \
745 const UINT8 *pix1; \
746 \
747 p = block; \
748 pix = pixels; \
749 pix1 = pixels + line_size; \
750 do { \
751 OP(p[0], avg2(pix[0], pix1[0])); \
752 OP(p[1], avg2(pix[1], pix1[1])); \
753 OP(p[2], avg2(pix[2], pix1[2])); \
754 OP(p[3], avg2(pix[3], pix1[3])); \
755 OP(p[4], avg2(pix[4], pix1[4])); \
756 OP(p[5], avg2(pix[5], pix1[5])); \
757 OP(p[6], avg2(pix[6], pix1[6])); \
758 OP(p[7], avg2(pix[7], pix1[7])); \
759 pix += line_size; \
760 pix1 += line_size; \
761 p += INCR; \
762 } while(--h); \
763} \
764 \
765static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
766{ \
767 BTYPE *p; \
768 const UINT8 *pix; \
769 const UINT8 *pix1; \
770 \
771 p = block; \
772 pix = pixels; \
773 pix1 = pixels + line_size; \
774 do { \
775 OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1])); \
776 OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2])); \
777 OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3])); \
778 OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4])); \
779 OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5])); \
780 OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6])); \
781 OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7])); \
782 OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8])); \
783 pix += line_size; \
784 pix1 += line_size; \
785 p += INCR; \
786 } while(--h); \
787} \
788 \
789void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
790 OPNAME ## _pixels, \
791 OPNAME ## _pixels_x2, \
792 OPNAME ## _pixels_y2, \
793 OPNAME ## _pixels_xy2, \
794};
795
de6d9b64
FB
796/* rounding primitives */
797#define avg2(a,b) ((a+b+1)>>1)
798#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
799
de6d9b64
FB
/* OP arguments: average into, subtract from, or overwrite the destination */
800#define op_avg(a, b) a = avg2(a, b)
801#define op_sub(a, b) a -= b
3aa102be 802#define op_put(a, b) a = b
de6d9b64 803
/* sub writes DCTELEMs, 8 per row; avg and put work in place on pictures */
de6d9b64 804PIXOP(DCTELEM, sub, op_sub, 8)
3aa102be
MN
805PIXOP(uint8_t, avg, op_avg, line_size)
806PIXOP(uint8_t, put, op_put, line_size)
de6d9b64
FB
807
808/* not rounding primitives */
809#undef avg2
810#undef avg4
811#define avg2(a,b) ((a+b)>>1)
812#define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
813
3aa102be
MN
814PIXOP(uint8_t, avg_no_rnd, op_avg, line_size)
815PIXOP(uint8_t, put_no_rnd, op_put, line_size)
de6d9b64
FB
816/* motion estimation */
817
818#undef avg2
819#undef avg4
57060b1e
FB
820#endif
821
de6d9b64
FB
/* Rounding primitives: mean of two / four values, with halves rounded up.
 * Every argument is wrapped in parentheses so that expressions with
 * low-precedence operators (a|b, a&b, c?x:y, ...) can be passed safely;
 * each argument is still evaluated exactly once. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
824
b3184779 825static void gmc1_c(UINT8 *dst, UINT8 *src, int stride, int h, int x16, int y16, int rounder)
44eb4951
MN
826{
827 const int A=(16-x16)*(16-y16);
828 const int B=( x16)*(16-y16);
829 const int C=(16-x16)*( y16);
830 const int D=( x16)*( y16);
831 int i;
832 rounder= 128 - rounder;
833
834 for(i=0; i<h; i++)
835 {
b3184779
MN
836 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
837 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
838 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
839 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
840 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
841 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
842 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
843 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
844 dst+= stride;
845 src+= stride;
44eb4951
MN
846 }
847}
848
b3184779 849static inline void copy_block17(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
44eb4951 850{
44eb4951
MN
851 int i;
852 for(i=0; i<h; i++)
853 {
b3184779
MN
854 ST32(dst , LD32(src ));
855 ST32(dst+4 , LD32(src+4 ));
856 ST32(dst+8 , LD32(src+8 ));
857 ST32(dst+12, LD32(src+12));
858 dst[16]= src[16];
44eb4951
MN
859 dst+=dstStride;
860 src+=srcStride;
861 }
862}
863
b3184779 864static inline void copy_block9(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
44eb4951
MN
865{
866 int i;
b3184779 867 for(i=0; i<h; i++)
44eb4951 868 {
b3184779
MN
869 ST32(dst , LD32(src ));
870 ST32(dst+4 , LD32(src+4 ));
871 dst[8]= src[8];
44eb4951
MN
872 dst+=dstStride;
873 src+=srcStride;
874 }
875}
876
b3184779
MN
/*
 * Unscaled output of the lowpass filter used by the qpel code below:
 * four sample pairs weighted 20, -6, 3 and -1.  The caller's OP() is
 * responsible for rounding, the >>5 scaling and clamping through cm[].
 */
#define QPEL_FILTER(a0, a1, b0, b1, c0, c1, d0, d1) \
    (((a0) + (a1)) * 20 - ((b0) + (b1)) * 6 + ((c0) + (c1)) * 3 - ((d0) + (d1)))

/*
 * The sixteen quarter-pel position functions for one block size.
 *   OPNAME  prefix of the generated functions / pixel helpers (e.g. put_)
 *   RND     infix selecting which put..lowpass filters build intermediates
 *   N       block width and height (8 or 16)
 *   N1      N + 1, the number of source rows/columns the filters read
 *   FS      row pitch of the local copy of the source block
 * Function names are qpel<N>_mcXY_c; X and Y match the index order of the
 * table emitted by QPEL_MC (index = X + 4*Y).
 */
#define QPEL_MC_FUNCS(OPNAME, RND, N, N1, FS) \
static void OPNAME ## qpel ## N ## _mc00_c(UINT8 *dst, UINT8 *src, int stride)\
{\
    OPNAME ## pixels ## N(dst, src, stride, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc10_c(UINT8 *dst, UINT8 *src, int stride)\
{\
    UINT8 filt[N * N];\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(filt, src, N, stride, N);\
    OPNAME ## pixels ## N ## _l2(dst, src, filt, stride, stride, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc20_c(UINT8 *dst, UINT8 *src, int stride)\
{\
    OPNAME ## mpeg4_qpel ## N ## _h_lowpass(dst, src, stride, stride, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc30_c(UINT8 *dst, UINT8 *src, int stride)\
{\
    UINT8 filt[N * N];\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(filt, src, N, stride, N);\
    OPNAME ## pixels ## N ## _l2(dst, src + 1, filt, stride, stride, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc01_c(UINT8 *dst, UINT8 *src, int stride)\
{\
    UINT8 ref[FS * N1];\
    UINT8 filt[N * N];\
    copy_block ## N1(ref, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(filt, ref, N, FS, N);\
    OPNAME ## pixels ## N ## _l2(dst, ref, filt, stride, FS, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc02_c(UINT8 *dst, UINT8 *src, int stride)\
{\
    UINT8 ref[FS * N1];\
    copy_block ## N1(ref, src, FS, stride, N1);\
    OPNAME ## mpeg4_qpel ## N ## _v_lowpass(dst, ref, stride, FS, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc03_c(UINT8 *dst, UINT8 *src, int stride)\
{\
    UINT8 ref[FS * N1];\
    UINT8 filt[N * N];\
    copy_block ## N1(ref, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(filt, ref, N, FS, N);\
    OPNAME ## pixels ## N ## _l2(dst, ref + FS, filt, stride, FS, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc11_c(UINT8 *dst, UINT8 *src, int stride)\
{\
    UINT8 ref[FS * N1];\
    UINT8 hFilt[N * N1];\
    UINT8 vFilt[N * N];\
    UINT8 hvFilt[N * N];\
    copy_block ## N1(ref, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(hFilt, ref, N, FS, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(vFilt, ref, N, FS, N);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(hvFilt, hFilt, N, N, N);\
    OPNAME ## pixels ## N ## _l4(dst, ref, hFilt, vFilt, hvFilt, stride, FS, N, N, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc31_c(UINT8 *dst, UINT8 *src, int stride)\
{\
    UINT8 ref[FS * N1];\
    UINT8 hFilt[N * N1];\
    UINT8 vFilt[N * N];\
    UINT8 hvFilt[N * N];\
    copy_block ## N1(ref, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(hFilt, ref, N, FS, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(vFilt, ref + 1, N, FS, N);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(hvFilt, hFilt, N, N, N);\
    OPNAME ## pixels ## N ## _l4(dst, ref + 1, hFilt, vFilt, hvFilt, stride, FS, N, N, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc13_c(UINT8 *dst, UINT8 *src, int stride)\
{\
    UINT8 ref[FS * N1];\
    UINT8 hFilt[N * N1];\
    UINT8 vFilt[N * N];\
    UINT8 hvFilt[N * N];\
    copy_block ## N1(ref, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(hFilt, ref, N, FS, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(vFilt, ref, N, FS, N);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(hvFilt, hFilt, N, N, N);\
    OPNAME ## pixels ## N ## _l4(dst, ref + FS, hFilt + N, vFilt, hvFilt, stride, FS, N, N, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc33_c(UINT8 *dst, UINT8 *src, int stride)\
{\
    UINT8 ref[FS * N1];\
    UINT8 hFilt[N * N1];\
    UINT8 vFilt[N * N];\
    UINT8 hvFilt[N * N];\
    copy_block ## N1(ref, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(hFilt, ref, N, FS, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(vFilt, ref + 1, N, FS, N);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(hvFilt, hFilt, N, N, N);\
    OPNAME ## pixels ## N ## _l4(dst, ref + FS + 1, hFilt + N, vFilt, hvFilt, stride, FS, N, N, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc21_c(UINT8 *dst, UINT8 *src, int stride)\
{\
    UINT8 hFilt[N * N1];\
    UINT8 hvFilt[N * N];\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(hFilt, src, N, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(hvFilt, hFilt, N, N, N);\
    OPNAME ## pixels ## N ## _l2(dst, hFilt, hvFilt, stride, N, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc23_c(UINT8 *dst, UINT8 *src, int stride)\
{\
    UINT8 hFilt[N * N1];\
    UINT8 hvFilt[N * N];\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(hFilt, src, N, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(hvFilt, hFilt, N, N, N);\
    OPNAME ## pixels ## N ## _l2(dst, hFilt + N, hvFilt, stride, N, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc12_c(UINT8 *dst, UINT8 *src, int stride)\
{\
    UINT8 ref[FS * N1];\
    UINT8 hFilt[N * N1];\
    UINT8 vFilt[N * N];\
    UINT8 hvFilt[N * N];\
    copy_block ## N1(ref, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(hFilt, ref, N, FS, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(vFilt, ref, N, FS, N);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(hvFilt, hFilt, N, N, N);\
    OPNAME ## pixels ## N ## _l2(dst, vFilt, hvFilt, stride, N, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc32_c(UINT8 *dst, UINT8 *src, int stride)\
{\
    UINT8 ref[FS * N1];\
    UINT8 hFilt[N * N1];\
    UINT8 vFilt[N * N];\
    UINT8 hvFilt[N * N];\
    copy_block ## N1(ref, src, FS, stride, N1);\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(hFilt, ref, N, FS, N1);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(vFilt, ref + 1, N, FS, N);\
    put ## RND ## mpeg4_qpel ## N ## _v_lowpass(hvFilt, hFilt, N, N, N);\
    OPNAME ## pixels ## N ## _l2(dst, vFilt, hvFilt, stride, N, N, N);\
}\
\
static void OPNAME ## qpel ## N ## _mc22_c(UINT8 *dst, UINT8 *src, int stride)\
{\
    UINT8 hFilt[N * N1];\
    put ## RND ## mpeg4_qpel ## N ## _h_lowpass(hFilt, src, N, stride, N1);\
    OPNAME ## mpeg4_qpel ## N ## _v_lowpass(dst, hFilt, stride, N, N);\
}

/*
 * Generate the quarter-pel motion compensation functions for one store
 * operator.
 *   r       not referenced by the macro body
 *   OPNAME  prefix for every generated symbol (put_, put_no_rnd_, avg_)
 *   RND     infix choosing the put..lowpass filters for intermediate planes
 *   OP      OP(dst_lvalue, filter_sum): stores one filtered sample; it may
 *           use the local clamp-table pointer cm
 * Emits the 8- and 16-wide horizontal/vertical lowpass filters, the 2x16
 * position functions and the lookup table OPNAME##qpel_pixels_tab
 * ([0] = 16x16, [1] = 8x8).  Near the block edges the filters reuse
 * in-block samples instead of reading outside columns/rows 0..N.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)\
{\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int row;\
    for (row = 0; row < h; row++) {\
        OP(dst[0], QPEL_FILTER(src[0], src[1], src[0], src[2], src[1], src[3], src[2], src[4]));\
        OP(dst[1], QPEL_FILTER(src[1], src[2], src[0], src[3], src[0], src[4], src[1], src[5]));\
        OP(dst[2], QPEL_FILTER(src[2], src[3], src[1], src[4], src[0], src[5], src[0], src[6]));\
        OP(dst[3], QPEL_FILTER(src[3], src[4], src[2], src[5], src[1], src[6], src[0], src[7]));\
        OP(dst[4], QPEL_FILTER(src[4], src[5], src[3], src[6], src[2], src[7], src[1], src[8]));\
        OP(dst[5], QPEL_FILTER(src[5], src[6], src[4], src[7], src[3], src[8], src[2], src[8]));\
        OP(dst[6], QPEL_FILTER(src[6], src[7], src[5], src[8], src[4], src[8], src[3], src[7]));\
        OP(dst[7], QPEL_FILTER(src[7], src[8], src[6], src[8], src[5], src[7], src[4], src[6]));\
        dst += dstStride;\
        src += srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w)\
{\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int col;\
    for (col = 0; col < w; col++) {\
        const int s0 = src[0 * srcStride];\
        const int s1 = src[1 * srcStride];\
        const int s2 = src[2 * srcStride];\
        const int s3 = src[3 * srcStride];\
        const int s4 = src[4 * srcStride];\
        const int s5 = src[5 * srcStride];\
        const int s6 = src[6 * srcStride];\
        const int s7 = src[7 * srcStride];\
        const int s8 = src[8 * srcStride];\
        OP(dst[0 * dstStride], QPEL_FILTER(s0, s1, s0, s2, s1, s3, s2, s4));\
        OP(dst[1 * dstStride], QPEL_FILTER(s1, s2, s0, s3, s0, s4, s1, s5));\
        OP(dst[2 * dstStride], QPEL_FILTER(s2, s3, s1, s4, s0, s5, s0, s6));\
        OP(dst[3 * dstStride], QPEL_FILTER(s3, s4, s2, s5, s1, s6, s0, s7));\
        OP(dst[4 * dstStride], QPEL_FILTER(s4, s5, s3, s6, s2, s7, s1, s8));\
        OP(dst[5 * dstStride], QPEL_FILTER(s5, s6, s4, s7, s3, s8, s2, s8));\
        OP(dst[6 * dstStride], QPEL_FILTER(s6, s7, s5, s8, s4, s8, s3, s7));\
        OP(dst[7 * dstStride], QPEL_FILTER(s7, s8, s6, s8, s5, s7, s4, s6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)\
{\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int row;\
    for (row = 0; row < h; row++) {\
        OP(dst[ 0], QPEL_FILTER(src[ 0], src[ 1], src[ 0], src[ 2], src[ 1], src[ 3], src[ 2], src[ 4]));\
        OP(dst[ 1], QPEL_FILTER(src[ 1], src[ 2], src[ 0], src[ 3], src[ 0], src[ 4], src[ 1], src[ 5]));\
        OP(dst[ 2], QPEL_FILTER(src[ 2], src[ 3], src[ 1], src[ 4], src[ 0], src[ 5], src[ 0], src[ 6]));\
        OP(dst[ 3], QPEL_FILTER(src[ 3], src[ 4], src[ 2], src[ 5], src[ 1], src[ 6], src[ 0], src[ 7]));\
        OP(dst[ 4], QPEL_FILTER(src[ 4], src[ 5], src[ 3], src[ 6], src[ 2], src[ 7], src[ 1], src[ 8]));\
        OP(dst[ 5], QPEL_FILTER(src[ 5], src[ 6], src[ 4], src[ 7], src[ 3], src[ 8], src[ 2], src[ 9]));\
        OP(dst[ 6], QPEL_FILTER(src[ 6], src[ 7], src[ 5], src[ 8], src[ 4], src[ 9], src[ 3], src[10]));\
        OP(dst[ 7], QPEL_FILTER(src[ 7], src[ 8], src[ 6], src[ 9], src[ 5], src[10], src[ 4], src[11]));\
        OP(dst[ 8], QPEL_FILTER(src[ 8], src[ 9], src[ 7], src[10], src[ 6], src[11], src[ 5], src[12]));\
        OP(dst[ 9], QPEL_FILTER(src[ 9], src[10], src[ 8], src[11], src[ 7], src[12], src[ 6], src[13]));\
        OP(dst[10], QPEL_FILTER(src[10], src[11], src[ 9], src[12], src[ 8], src[13], src[ 7], src[14]));\
        OP(dst[11], QPEL_FILTER(src[11], src[12], src[10], src[13], src[ 9], src[14], src[ 8], src[15]));\
        OP(dst[12], QPEL_FILTER(src[12], src[13], src[11], src[14], src[10], src[15], src[ 9], src[16]));\
        OP(dst[13], QPEL_FILTER(src[13], src[14], src[12], src[15], src[11], src[16], src[10], src[16]));\
        OP(dst[14], QPEL_FILTER(src[14], src[15], src[13], src[16], src[12], src[16], src[11], src[15]));\
        OP(dst[15], QPEL_FILTER(src[15], src[16], src[14], src[16], src[13], src[15], src[12], src[14]));\
        dst += dstStride;\
        src += srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w)\
{\
    UINT8 *cm = cropTbl + MAX_NEG_CROP;\
    int col;\
    for (col = 0; col < w; col++) {\
        const int s0  = src[ 0 * srcStride];\
        const int s1  = src[ 1 * srcStride];\
        const int s2  = src[ 2 * srcStride];\
        const int s3  = src[ 3 * srcStride];\
        const int s4  = src[ 4 * srcStride];\
        const int s5  = src[ 5 * srcStride];\
        const int s6  = src[ 6 * srcStride];\
        const int s7  = src[ 7 * srcStride];\
        const int s8  = src[ 8 * srcStride];\
        const int s9  = src[ 9 * srcStride];\
        const int s10 = src[10 * srcStride];\
        const int s11 = src[11 * srcStride];\
        const int s12 = src[12 * srcStride];\
        const int s13 = src[13 * srcStride];\
        const int s14 = src[14 * srcStride];\
        const int s15 = src[15 * srcStride];\
        const int s16 = src[16 * srcStride];\
        OP(dst[ 0 * dstStride], QPEL_FILTER(s0,  s1,  s0,  s2,  s1,  s3,  s2,  s4 ));\
        OP(dst[ 1 * dstStride], QPEL_FILTER(s1,  s2,  s0,  s3,  s0,  s4,  s1,  s5 ));\
        OP(dst[ 2 * dstStride], QPEL_FILTER(s2,  s3,  s1,  s4,  s0,  s5,  s0,  s6 ));\
        OP(dst[ 3 * dstStride], QPEL_FILTER(s3,  s4,  s2,  s5,  s1,  s6,  s0,  s7 ));\
        OP(dst[ 4 * dstStride], QPEL_FILTER(s4,  s5,  s3,  s6,  s2,  s7,  s1,  s8 ));\
        OP(dst[ 5 * dstStride], QPEL_FILTER(s5,  s6,  s4,  s7,  s3,  s8,  s2,  s9 ));\
        OP(dst[ 6 * dstStride], QPEL_FILTER(s6,  s7,  s5,  s8,  s4,  s9,  s3,  s10));\
        OP(dst[ 7 * dstStride], QPEL_FILTER(s7,  s8,  s6,  s9,  s5,  s10, s4,  s11));\
        OP(dst[ 8 * dstStride], QPEL_FILTER(s8,  s9,  s7,  s10, s6,  s11, s5,  s12));\
        OP(dst[ 9 * dstStride], QPEL_FILTER(s9,  s10, s8,  s11, s7,  s12, s6,  s13));\
        OP(dst[10 * dstStride], QPEL_FILTER(s10, s11, s9,  s12, s8,  s13, s7,  s14));\
        OP(dst[11 * dstStride], QPEL_FILTER(s11, s12, s10, s13, s9,  s14, s8,  s15));\
        OP(dst[12 * dstStride], QPEL_FILTER(s12, s13, s11, s14, s10, s15, s9,  s16));\
        OP(dst[13 * dstStride], QPEL_FILTER(s13, s14, s12, s15, s11, s16, s10, s16));\
        OP(dst[14 * dstStride], QPEL_FILTER(s14, s15, s13, s16, s12, s16, s11, s15));\
        OP(dst[15 * dstStride], QPEL_FILTER(s15, s16, s14, s16, s13, s15, s12, s14));\
        dst++;\
        src++;\
    }\
}\
\
QPEL_MC_FUNCS(OPNAME, RND, 8, 9, 16)\
\
QPEL_MC_FUNCS(OPNAME, RND, 16, 17, 24)\
\
qpel_mc_func OPNAME ## qpel_pixels_tab[2][16]={ \
    {\
        OPNAME ## qpel16_mc00_c, \
        OPNAME ## qpel16_mc10_c, \
        OPNAME ## qpel16_mc20_c, \
        OPNAME ## qpel16_mc30_c, \
        OPNAME ## qpel16_mc01_c, \
        OPNAME ## qpel16_mc11_c, \
        OPNAME ## qpel16_mc21_c, \
        OPNAME ## qpel16_mc31_c, \
        OPNAME ## qpel16_mc02_c, \
        OPNAME ## qpel16_mc12_c, \
        OPNAME ## qpel16_mc22_c, \
        OPNAME ## qpel16_mc32_c, \
        OPNAME ## qpel16_mc03_c, \
        OPNAME ## qpel16_mc13_c, \
        OPNAME ## qpel16_mc23_c, \
        OPNAME ## qpel16_mc33_c, \
    },{\
        OPNAME ## qpel8_mc00_c, \
        OPNAME ## qpel8_mc10_c, \
        OPNAME ## qpel8_mc20_c, \
        OPNAME ## qpel8_mc30_c, \
        OPNAME ## qpel8_mc01_c, \
        OPNAME ## qpel8_mc11_c, \
        OPNAME ## qpel8_mc21_c, \
        OPNAME ## qpel8_mc31_c, \
        OPNAME ## qpel8_mc02_c, \
        OPNAME ## qpel8_mc12_c, \
        OPNAME ## qpel8_mc22_c, \
        OPNAME ## qpel8_mc32_c, \
        OPNAME ## qpel8_mc03_c, \
        OPNAME ## qpel8_mc13_c, \
        OPNAME ## qpel8_mc23_c, \
        OPNAME ## qpel8_mc33_c, \
    }\
};
1281
b3184779
MN
1282#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1283#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1284#define op_put(a, b) a = cm[((b) + 16)>>5]
1285#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1286
1287QPEL_MC(0, put_ , _ , op_put)
1288QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1289QPEL_MC(0, avg_ , _ , op_avg)
1290//QPEL_MC(1, avg_no_rnd , _ , op_avg)
1291#undef op_avg
1292#undef op_avg_no_rnd
1293#undef op_put
1294#undef op_put_no_rnd
44eb4951 1295
ba6802de 1296int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1297{
1298 int s, i;
1299
1300 s = 0;
ba6802de 1301 for(i=0;i<16;i++) {
de6d9b64
FB
1302 s += abs(pix1[0] - pix2[0]);
1303 s += abs(pix1[1] - pix2[1]);
1304 s += abs(pix1[2] - pix2[2]);
1305 s += abs(pix1[3] - pix2[3]);
1306 s += abs(pix1[4] - pix2[4]);
1307 s += abs(pix1[5] - pix2[5]);
1308 s += abs(pix1[6] - pix2[6]);
1309 s += abs(pix1[7] - pix2[7]);
1310 s += abs(pix1[8] - pix2[8]);
1311 s += abs(pix1[9] - pix2[9]);
1312 s += abs(pix1[10] - pix2[10]);
1313 s += abs(pix1[11] - pix2[11]);
1314 s += abs(pix1[12] - pix2[12]);
1315 s += abs(pix1[13] - pix2[13]);
1316 s += abs(pix1[14] - pix2[14]);
1317 s += abs(pix1[15] - pix2[15]);
1318 pix1 += line_size;
1319 pix2 += line_size;
1320 }
1321 return s;
1322}
1323
ba6802de 1324int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1325{
1326 int s, i;
1327
1328 s = 0;
ba6802de 1329 for(i=0;i<16;i++) {
de6d9b64
FB
1330 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1331 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1332 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1333 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1334 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1335 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1336 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1337 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1338 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1339 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1340 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1341 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1342 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1343 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1344 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1345 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1346 pix1 += line_size;
1347 pix2 += line_size;
1348 }
1349 return s;
1350}
1351
ba6802de 1352int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1353{
1354 int s, i;
1355 UINT8 *pix3 = pix2 + line_size;
1356
1357 s = 0;
ba6802de 1358 for(i=0;i<16;i++) {
de6d9b64
FB
1359 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1360 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1361 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1362 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1363 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1364 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1365 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1366 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1367 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1368 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1369 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1370 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1371 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1372 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1373 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1374 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1375 pix1 += line_size;
1376 pix2 += line_size;
1377 pix3 += line_size;
1378 }
1379 return s;
1380}
1381
ba6802de 1382int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1383{
1384 int s, i;
1385 UINT8 *pix3 = pix2 + line_size;
1386
1387 s = 0;
ba6802de 1388 for(i=0;i<16;i++) {
de6d9b64
FB
1389 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1390 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1391 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1392 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1393 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1394 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1395 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1396 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1397 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1398 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1399 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1400 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1401 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1402 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1403 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1404 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1405 pix1 += line_size;
1406 pix2 += line_size;
1407 pix3 += line_size;
1408 }
1409 return s;
1410}
1411
ba6802de
MN
1412int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1413{
1414 int s, i;
1415
1416 s = 0;
1417 for(i=0;i<8;i++) {
1418 s += abs(pix1[0] - pix2[0]);
1419 s += abs(pix1[1] - pix2[1]);
1420 s += abs(pix1[2] - pix2[2]);
1421 s += abs(pix1[3] - pix2[3]);
1422 s += abs(pix1[4] - pix2[4]);
1423 s += abs(pix1[5] - pix2[5]);
1424 s += abs(pix1[6] - pix2[6]);
1425 s += abs(pix1[7] - pix2[7]);
1426 pix1 += line_size;
1427 pix2 += line_size;
1428 }
1429 return s;
1430}
1431
1432int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1433{
1434 int s, i;
1435
1436 s = 0;
1437 for(i=0;i<8;i++) {
1438 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1439 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1440 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1441 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1442 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1443 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1444 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1445 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1446 pix1 += line_size;
1447 pix2 += line_size;
1448 }
1449 return s;
1450}
1451
1452int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1453{
1454 int s, i;
1455 UINT8 *pix3 = pix2 + line_size;
1456
1457 s = 0;
1458 for(i=0;i<8;i++) {
1459 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1460 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1461 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1462 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1463 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1464 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1465 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1466 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1467 pix1 += line_size;
1468 pix2 += line_size;
1469 pix3 += line_size;
1470 }
1471 return s;
1472}
1473
1474int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1475{
1476 int s, i;
1477 UINT8 *pix3 = pix2 + line_size;
1478
1479 s = 0;
1480 for(i=0;i<8;i++) {
1481 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1482 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1483 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1484 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1485 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1486 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1487 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1488 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1489 pix1 += line_size;
1490 pix2 += line_size;
1491 pix3 += line_size;
1492 }
1493 return s;
1494}
1495
e0eac44e
FB
1496/* permute block according so that it corresponds to the MMX idct
1497 order */
2ad1516a 1498void block_permute(INT16 *block, UINT8 *permutation)
d962f6fd
A
1499{
1500 int i;
1501 INT16 temp[64];
1502
2ad1516a 1503 for(i=0; i<64; i++) temp[ permutation[i] ] = block[i];
d962f6fd
A
1504
1505 for(i=0; i<64; i++) block[i] = temp[i];
d962f6fd 1506}
e0eac44e 1507
649c00c9
MN
1508void clear_blocks_c(DCTELEM *blocks)
1509{
1510 memset(blocks, 0, sizeof(DCTELEM)*6*64);
1511}
1512
e0eac44e
FB
1513void dsputil_init(void)
1514{
d2975f8d 1515 int i;
e0eac44e 1516
de6d9b64
FB
1517 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1518 for(i=0;i<MAX_NEG_CROP;i++) {
1519 cropTbl[i] = 0;
1520 cropTbl[i + MAX_NEG_CROP + 256] = 255;
1521 }
1522
1523 for(i=0;i<512;i++) {
1524 squareTbl[i] = (i - 256) * (i - 256);
1525 }
1526
1527 get_pixels = get_pixels_c;
9dbcbd92 1528 diff_pixels = diff_pixels_c;
de6d9b64
FB
1529 put_pixels_clamped = put_pixels_clamped_c;
1530 add_pixels_clamped = add_pixels_clamped_c;
44eb4951 1531 gmc1= gmc1_c;
649c00c9 1532 clear_blocks= clear_blocks_c;
3aa102be
MN
1533 pix_sum= pix_sum_c;
1534 pix_norm1= pix_norm1_c;
de6d9b64 1535
ba6802de
MN
1536 pix_abs16x16 = pix_abs16x16_c;
1537 pix_abs16x16_x2 = pix_abs16x16_x2_c;
1538 pix_abs16x16_y2 = pix_abs16x16_y2_c;
de6d9b64 1539 pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
ba6802de
MN
1540 pix_abs8x8 = pix_abs8x8_c;
1541 pix_abs8x8_x2 = pix_abs8x8_x2_c;
1542 pix_abs8x8_y2 = pix_abs8x8_y2_c;
1543 pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
de6d9b64 1544
980fc7b8 1545#ifdef HAVE_MMX
de6d9b64
FB
1546 dsputil_init_mmx();
1547#endif
3d03c0a2
FB
1548#ifdef ARCH_ARMV4L
1549 dsputil_init_armv4l();
1550#endif
c34270f5
FB
1551#ifdef HAVE_MLIB
1552 dsputil_init_mlib();
c34270f5 1553#endif
1e98dffb
NK
1554#ifdef ARCH_ALPHA
1555 dsputil_init_alpha();
1e98dffb 1556#endif
59925ef2 1557#ifdef ARCH_POWERPC
ab6c65f6 1558 dsputil_init_ppc();
a43bd1d7 1559#endif
d46aba26
LS
1560#ifdef HAVE_MMI
1561 dsputil_init_mmi();
d46aba26 1562#endif
c34270f5 1563
2ad1516a 1564 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
de6d9b64 1565}
43f1708f 1566
57060b1e
FB
1567/* remove any non bit exact operation (testing purpose) */
1568void avcodec_set_bit_exact(void)
1569{
5596c60c 1570 ff_bit_exact=1;
57060b1e
FB
1571#ifdef HAVE_MMX
1572 dsputil_set_bit_exact_mmx();
1573#endif
1574}
1575
43f1708f
J
1576void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1577 int orig_linesize[3], int coded_linesize,
1578 AVCodecContext *avctx)
1579{
1580 int quad, diff, x, y;
1581 UINT8 *orig, *coded;
1582 UINT32 *sq = squareTbl + 256;
1583
1584 quad = 0;
1585 diff = 0;
1586
1587 /* Luminance */
1588 orig = orig_image[0];
1589 coded = coded_image[0];
1590
1591 for (y=0;y<avctx->height;y++) {
1592 for (x=0;x<avctx->width;x++) {
1593 diff = *(orig + x) - *(coded + x);
1594 quad += sq[diff];
1595 }
1596 orig += orig_linesize[0];
1597 coded += coded_linesize;
1598 }
1599
1600 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1601
1602 if (avctx->psnr_y) {
1603 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1604 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1605 } else
1606 avctx->psnr_y = 99.99;
1607}
1608