fb98f21c1fe3fc7cc7caab7d685ada5e800c2a09
[libav.git] / libavcodec / dsputil.c
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
20 */
21
22 /**
23 * @file dsputil.c
24 * DSP utils
25 */
26
27 #include "avcodec.h"
28 #include "dsputil.h"
29 #include "mpegvideo.h"
30 #include "simple_idct.h"
31 #include "faandct.h"
32
/* Clamping lookup table: cropTbl[MAX_NEG_CROP + x] yields x clamped to
 * 0..255.  Filled in at init time (presumably dsputil_init — not visible
 * in this chunk). */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Square lookup table: squareTbl[256 + x] == x*x for -256 <= x < 256;
 * used by the pix_norm1/sse routines below.  Also initialised elsewhere. */
uint32_t squareTbl[512];

/* Standard JPEG/MPEG zigzag coefficient scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
uint16_t __align8 inv_zigzag_direct16[64];

/* Alternate (horizontal-first) scan order, e.g. for interlaced material. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

/* Alternate (vertical-first) scan order. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
 * (fixed-point reciprocal table used to replace division). */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
119
static int pix_sum_c(uint8_t * pix, int line_size)
{
    /* Return the sum of all 256 pixels of the 16x16 block at pix,
     * where consecutive rows are line_size bytes apart. */
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;      /* advance to the next row */
    }
    return total;
}
141
142 static int pix_norm1_c(uint8_t * pix, int line_size)
143 {
144 int s, i, j;
145 uint32_t *sq = squareTbl + 256;
146
147 s = 0;
148 for (i = 0; i < 16; i++) {
149 for (j = 0; j < 16; j += 8) {
150 #if 0
151 s += sq[pix[0]];
152 s += sq[pix[1]];
153 s += sq[pix[2]];
154 s += sq[pix[3]];
155 s += sq[pix[4]];
156 s += sq[pix[5]];
157 s += sq[pix[6]];
158 s += sq[pix[7]];
159 #else
160 #if LONG_MAX > 2147483647
161 register uint64_t x=*(uint64_t*)pix;
162 s += sq[x&0xff];
163 s += sq[(x>>8)&0xff];
164 s += sq[(x>>16)&0xff];
165 s += sq[(x>>24)&0xff];
166 s += sq[(x>>32)&0xff];
167 s += sq[(x>>40)&0xff];
168 s += sq[(x>>48)&0xff];
169 s += sq[(x>>56)&0xff];
170 #else
171 register uint32_t x=*(uint32_t*)pix;
172 s += sq[x&0xff];
173 s += sq[(x>>8)&0xff];
174 s += sq[(x>>16)&0xff];
175 s += sq[(x>>24)&0xff];
176 x=*(uint32_t*)(pix+4);
177 s += sq[x&0xff];
178 s += sq[(x>>8)&0xff];
179 s += sq[(x>>16)&0xff];
180 s += sq[(x>>24)&0xff];
181 #endif
182 #endif
183 pix += 8;
184 }
185 pix += line_size - 16;
186 }
187 return s;
188 }
189
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    /* Byte-swap w 32-bit words from src into dst (dst may equal src).
     * The manual 8x unrolling of the original is left to the compiler. */
    int idx;

    for (idx = 0; idx < w; idx++)
        dst[idx] = bswap_32(src[idx]);
}
207
208 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size)
209 {
210 int s, i;
211 uint32_t *sq = squareTbl + 256;
212
213 s = 0;
214 for (i = 0; i < 8; i++) {
215 s += sq[pix1[0] - pix2[0]];
216 s += sq[pix1[1] - pix2[1]];
217 s += sq[pix1[2] - pix2[2]];
218 s += sq[pix1[3] - pix2[3]];
219 s += sq[pix1[4] - pix2[4]];
220 s += sq[pix1[5] - pix2[5]];
221 s += sq[pix1[6] - pix2[6]];
222 s += sq[pix1[7] - pix2[7]];
223 pix1 += line_size;
224 pix2 += line_size;
225 }
226 return s;
227 }
228
229 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
230 {
231 int s, i;
232 uint32_t *sq = squareTbl + 256;
233
234 s = 0;
235 for (i = 0; i < 16; i++) {
236 s += sq[pix1[ 0] - pix2[ 0]];
237 s += sq[pix1[ 1] - pix2[ 1]];
238 s += sq[pix1[ 2] - pix2[ 2]];
239 s += sq[pix1[ 3] - pix2[ 3]];
240 s += sq[pix1[ 4] - pix2[ 4]];
241 s += sq[pix1[ 5] - pix2[ 5]];
242 s += sq[pix1[ 6] - pix2[ 6]];
243 s += sq[pix1[ 7] - pix2[ 7]];
244 s += sq[pix1[ 8] - pix2[ 8]];
245 s += sq[pix1[ 9] - pix2[ 9]];
246 s += sq[pix1[10] - pix2[10]];
247 s += sq[pix1[11] - pix2[11]];
248 s += sq[pix1[12] - pix2[12]];
249 s += sq[pix1[13] - pix2[13]];
250 s += sq[pix1[14] - pix2[14]];
251 s += sq[pix1[15] - pix2[15]];
252
253 pix1 += line_size;
254 pix2 += line_size;
255 }
256 return s;
257 }
258
259 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
260 {
261 int i;
262
263 /* read the pixels */
264 for(i=0;i<8;i++) {
265 block[0] = pixels[0];
266 block[1] = pixels[1];
267 block[2] = pixels[2];
268 block[3] = pixels[3];
269 block[4] = pixels[4];
270 block[5] = pixels[5];
271 block[6] = pixels[6];
272 block[7] = pixels[7];
273 pixels += line_size;
274 block += 8;
275 }
276 }
277
278 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
279 const uint8_t *s2, int stride){
280 int i;
281
282 /* read the pixels */
283 for(i=0;i<8;i++) {
284 block[0] = s1[0] - s2[0];
285 block[1] = s1[1] - s2[1];
286 block[2] = s1[2] - s2[2];
287 block[3] = s1[3] - s2[3];
288 block[4] = s1[4] - s2[4];
289 block[5] = s1[5] - s2[5];
290 block[6] = s1[6] - s2[6];
291 block[7] = s1[7] - s2[7];
292 s1 += stride;
293 s2 += stride;
294 block += 8;
295 }
296 }
297
298
299 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
300 int line_size)
301 {
302 int i;
303 uint8_t *cm = cropTbl + MAX_NEG_CROP;
304
305 /* read the pixels */
306 for(i=0;i<8;i++) {
307 pixels[0] = cm[block[0]];
308 pixels[1] = cm[block[1]];
309 pixels[2] = cm[block[2]];
310 pixels[3] = cm[block[3]];
311 pixels[4] = cm[block[4]];
312 pixels[5] = cm[block[5]];
313 pixels[6] = cm[block[6]];
314 pixels[7] = cm[block[7]];
315
316 pixels += line_size;
317 block += 8;
318 }
319 }
320
321 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
322 int line_size)
323 {
324 int i;
325 uint8_t *cm = cropTbl + MAX_NEG_CROP;
326
327 /* read the pixels */
328 for(i=0;i<8;i++) {
329 pixels[0] = cm[pixels[0] + block[0]];
330 pixels[1] = cm[pixels[1] + block[1]];
331 pixels[2] = cm[pixels[2] + block[2]];
332 pixels[3] = cm[pixels[3] + block[3]];
333 pixels[4] = cm[pixels[4] + block[4]];
334 pixels[5] = cm[pixels[5] + block[5]];
335 pixels[6] = cm[pixels[6] + block[6]];
336 pixels[7] = cm[pixels[7] + block[7]];
337 pixels += line_size;
338 block += 8;
339 }
340 }
/* Pel copy/average primitive family.
 *
 * PIXOP2(OPNAME, OP) expands to the whole set of motion-compensation
 * primitives for OPNAME in {put, avg}: straight copies plus half-pel
 * x / y / xy interpolation, each in a rounding and a "no_rnd" variant,
 * for block widths 2/4/8/16.  The l2/l4 helpers average 2 or 4 source
 * rows bytewise in parallel inside a 32-bit word using the classic
 * SWAR masks (0x03030303 keeps the low 2 bits, 0xFCFCFCFC the high 6).
 *
 * NOTE(review): LD16/LD32/LD64, rnd_avg32/no_rnd_avg32 and
 * CALL_2X_PIXELS are not visible in this chunk — presumably defined in
 * dsputil.h; the LD* macros are assumed to perform unaligned-safe loads.
 *
 * The first branch below is a 64-bit-word variant that is disabled
 * ("#if 0"); the 32-bit variant after "#else" is the one compiled. */
#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0= (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1= (a&0x0303030303030303ULL)\
          + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0= (a&0x0303030303030303ULL)\
          + (b&0x0303030303030303ULL)\
          + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0= (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1= (a&0x0303030303030303ULL)\
          + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0= (a&0x0303030303030303ULL)\
          + (b&0x0303030303030303ULL)\
          + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

/* 32-bit-word variant — this is the branch that is actually compiled. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD16(&src1[i*src_stride1  ]);\
        b= LD16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i, a0, b0, a1, b1;\
    a0= pixels[0];\
    b0= pixels[1] + 2;\
    a0 += b0;\
    b0 += pixels[2];\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        a1= pixels[0];\
        b1= pixels[1];\
        a1 += b1;\
        b1 += pixels[2];\
\
        block[0]= (a1+a0)>>2; /* FIXME non put */\
        block[1]= (b1+b0)>>2;\
\
        pixels+=line_size;\
        block +=line_size;\
\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        block[0]= (a1+a0)>>2;\
        block[1]= (b1+b0)>>2;\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint32_t a= LD32(pixels  );\
    const uint32_t b= LD32(pixels+1);\
    uint32_t l0=  (a&0x03030303UL)\
                + (b&0x03030303UL)\
                + 0x02020202UL;\
    uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
               + ((b&0xFCFCFCFCUL)>>2);\
    uint32_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint32_t a= LD32(pixels  );\
        uint32_t b= LD32(pixels+1);\
        l1=  (a&0x03030303UL)\
           + (b&0x03030303UL);\
        h1= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD32(pixels  );\
        b= LD32(pixels+1);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c    , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

/* Instantiate the whole family twice: averaging (avg_*) and plain
 * store (put_*) variants. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
859
/* Rounded averages of 2 / 4 values.  The macro arguments are now fully
 * parenthesized: the original expansions used the arguments bare, so an
 * argument containing an operator of lower precedence than '+' (e.g.
 * `avg2(x|y, z)`) would expand incorrectly. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
862
863
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* One-warp-point global motion compensation: bilinear interpolation
     * of an 8-wide, h-tall block with 1/16-pel fractional offsets
     * (x16, y16), rounded and scaled back down by >>8.  The four
     * bilinear weights sum to 256. */
    const int w00 = (16 - x16) * (16 - y16);   /* top-left     */
    const int w01 = (     x16) * (16 - y16);   /* top-right    */
    const int w10 = (16 - x16) * (     y16);   /* bottom-left  */
    const int w11 = (     x16) * (     y16);   /* bottom-right */
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (w00 * src[col]
                      + w01 * src[col + 1]
                      + w10 * src[stride + col]
                      + w11 * src[stride + col + 1]
                      + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
886
/**
 * Global motion compensation with an affine warp.
 * (ox, oy) is the fixed-point source coordinate of the block's first
 * sample; (dxx, dyx) are its per-pixel increments across a row and
 * (dxy, dyy) its per-row increments.  Each of the 8 x h destination
 * pixels is bilinearly interpolated from the four surrounding source
 * samples with `shift` fractional bits (rounder r, downshift 2*shift);
 * reads outside the width x height source area are clamped to the edge.
 * NOTE(review): clip() is an external helper (not visible in this
 * chunk) — presumably clamps its first argument into [lo, hi].
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;           /* one full pel in fixed-point units */

    /* turn the dimensions into the largest valid coordinate */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            /* low `shift` bits are the sub-pel fraction, the rest the
             * integer sample position */
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* unsigned compare doubles as a >=0 check */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* clamped vertically: interpolate horizontally only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* clamped horizontally: interpolate vertically only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* clamped both ways: nearest edge sample, no filtering */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
944
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Third-pel MC, full-pel position: plain copy, dispatched on block
     * width to the matching put_pixelsN_c routine (generated by the
     * PIXOP2 macro above).  Any other width is silently ignored, as in
     * the original switch. */
    if (width == 2)
        put_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
953
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Third-pel MC, horizontal offset 1/3: round((2*a + b)/3), with the
     * division approximated as a multiply: 683/2048 ~= 1/3. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row*stride;
        uint8_t       *d = dst + row*stride;
        for (col = 0; col < width; col++)
            d[col] = (uint8_t)((683*(2*s[col] + s[col+1] + 1)) >> 11);
    }
}
964
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Third-pel MC, horizontal offset 2/3: round((a + 2*b)/3), 683/2048 ~= 1/3. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row*stride;
        uint8_t       *d = dst + row*stride;
        for (col = 0; col < width; col++)
            d[col] = (uint8_t)((683*(s[col] + 2*s[col+1] + 1)) >> 11);
    }
}
975
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Third-pel MC, vertical offset 1/3: round((2*a + below)/3), 683/2048 ~= 1/3. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row*stride;
        uint8_t       *d = dst + row*stride;
        for (col = 0; col < width; col++)
            d[col] = (uint8_t)((683*(2*s[col] + s[col+stride] + 1)) >> 11);
    }
}
986
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Third-pel MC, offset (1/3,1/3): bilinear weights 4/3/3/2 summing to 12,
     * with the /12 approximated as 2731/32768. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row*stride;
        uint8_t       *d = dst + row*stride;
        for (col = 0; col < width; col++)
            d[col] = (uint8_t)((2731*(4*s[col] + 3*s[col+1]
                                    + 3*s[col+stride] + 2*s[col+stride+1] + 6)) >> 15);
    }
}
997
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Third-pel MC, offset (1/3,2/3): bilinear weights 3/2/4/3 summing to 12,
     * with the /12 approximated as 2731/32768. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row*stride;
        uint8_t       *d = dst + row*stride;
        for (col = 0; col < width; col++)
            d[col] = (uint8_t)((2731*(3*s[col] + 2*s[col+1]
                                    + 4*s[col+stride] + 3*s[col+stride+1] + 6)) >> 15);
    }
}
1008
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Third-pel MC, vertical offset 2/3: round((a + 2*below)/3), 683/2048 ~= 1/3. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row*stride;
        uint8_t       *d = dst + row*stride;
        for (col = 0; col < width; col++)
            d[col] = (uint8_t)((683*(s[col] + 2*s[col+stride] + 1)) >> 11);
    }
}
1019
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Third-pel MC, offset (2/3,1/3): bilinear weights 3/4/2/3 summing to 12,
     * with the /12 approximated as 2731/32768. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row*stride;
        uint8_t       *d = dst + row*stride;
        for (col = 0; col < width; col++)
            d[col] = (uint8_t)((2731*(3*s[col] + 4*s[col+1]
                                    + 2*s[col+stride] + 3*s[col+stride+1] + 6)) >> 15);
    }
}
1030
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Third-pel MC, offset (2/3,2/3): bilinear weights 2/3/3/4 summing to 12,
     * with the /12 approximated as 2731/32768. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row*stride;
        uint8_t       *d = dst + row*stride;
        for (col = 0; col < width; col++)
            d[col] = (uint8_t)((2731*(2*s[col] + 3*s[col+1]
                                    + 3*s[col+stride] + 4*s[col+stride+1] + 6)) >> 15);
    }
}
1041
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Integer-position average: forward to the fixed-width averaging routine. */
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1050
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Third-pel MC (h offset 1/3) averaged with dst, rounding up. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row*stride;
        uint8_t       *d = dst + row*stride;
        for (col = 0; col < width; col++) {
            int p = (683*(2*s[col] + s[col+1] + 1)) >> 11;
            d[col] = (uint8_t)((d[col] + p + 1) >> 1);
        }
    }
}
1061
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Third-pel MC (h offset 2/3) averaged with dst, rounding up. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row*stride;
        uint8_t       *d = dst + row*stride;
        for (col = 0; col < width; col++) {
            int p = (683*(s[col] + 2*s[col+1] + 1)) >> 11;
            d[col] = (uint8_t)((d[col] + p + 1) >> 1);
        }
    }
}
1072
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Third-pel MC (v offset 1/3) averaged with dst, rounding up. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row*stride;
        uint8_t       *d = dst + row*stride;
        for (col = 0; col < width; col++) {
            int p = (683*(2*s[col] + s[col+stride] + 1)) >> 11;
            d[col] = (uint8_t)((d[col] + p + 1) >> 1);
        }
    }
}
1083
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Third-pel MC (offset 1/3,1/3; weights 4/3/3/2, /12 via 2731/32768)
     * averaged with dst, rounding up. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row*stride;
        uint8_t       *d = dst + row*stride;
        for (col = 0; col < width; col++) {
            int p = (2731*(4*s[col] + 3*s[col+1]
                         + 3*s[col+stride] + 2*s[col+stride+1] + 6)) >> 15;
            d[col] = (uint8_t)((d[col] + p + 1) >> 1);
        }
    }
}
1094
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Third-pel MC (offset 1/3,2/3; weights 3/2/4/3, /12 via 2731/32768)
     * averaged with dst, rounding up. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row*stride;
        uint8_t       *d = dst + row*stride;
        for (col = 0; col < width; col++) {
            int p = (2731*(3*s[col] + 2*s[col+1]
                         + 4*s[col+stride] + 3*s[col+stride+1] + 6)) >> 15;
            d[col] = (uint8_t)((d[col] + p + 1) >> 1);
        }
    }
}
1105
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Third-pel MC (v offset 2/3) averaged with dst, rounding up. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row*stride;
        uint8_t       *d = dst + row*stride;
        for (col = 0; col < width; col++) {
            int p = (683*(s[col] + 2*s[col+stride] + 1)) >> 11;
            d[col] = (uint8_t)((d[col] + p + 1) >> 1);
        }
    }
}
1116
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Third-pel MC (offset 2/3,1/3; weights 3/4/2/3, /12 via 2731/32768)
     * averaged with dst, rounding up. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row*stride;
        uint8_t       *d = dst + row*stride;
        for (col = 0; col < width; col++) {
            int p = (2731*(3*s[col] + 4*s[col+1]
                         + 2*s[col+stride] + 3*s[col+stride+1] + 6)) >> 15;
            d[col] = (uint8_t)((d[col] + p + 1) >> 1);
        }
    }
}
1127
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Third-pel MC (offset 2/3,2/3; weights 2/3/3/4, /12 via 2731/32768)
     * averaged with dst, rounding up. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row*stride;
        uint8_t       *d = dst + row*stride;
        for (col = 0; col < width; col++) {
            int p = (2731*(2*s[col] + 3*s[col+1]
                         + 3*s[col+stride] + 4*s[col+stride+1] + 6)) >> 15;
            d[col] = (uint8_t)((d[col] + p + 1) >> 1);
        }
    }
}
/* Dead scaffolding: would stamp out fixed-width wrappers around the generic
 * put_tpel_pixels_mc* helpers above.  Kept disabled -- and as written it is
 * not even valid C (note the stray "void" before each forwarded call). */
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1159
/**
 * Generate the 2-, 4- and 8-pixel-wide H.264 chroma motion compensation
 * functions for one operation (put or avg).
 *
 * x,y are the 1/8-pel fractional offsets (asserted to be in 0..7); the
 * four bilinear weights A..D always sum to 64, and OP is expected to fold
 * in the >>6 normalization (see op_put/op_avg below the macro).
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1222
/* OPs for H264_CHROMA_MC: the weighted sum b (weights total 64) is
 * normalized round-to-nearest via (b+32)>>6; op_avg additionally averages
 * the result with the existing destination pixel, rounding up. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1230
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 4-pixel-wide, h-tall block, one 32-bit load/store per row. */
    while (h-- > 0) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1241
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy an 8-pixel-wide, h-tall block, two 32-bit load/stores per row. */
    while (h-- > 0) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
1253
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 16-pixel-wide, h-tall block, four 32-bit load/stores per row. */
    while (h-- > 0) {
        ST32(dst,      LD32(src));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}
1267
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 17-pixel-wide, h-tall block (16+1 source pixels per row, as
     * needed by the 16-wide qpel filters): four 32-bit copies plus the
     * trailing byte. */
    while (h-- > 0) {
        ST32(dst,      LD32(src));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1282
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 9-pixel-wide, h-tall block (8+1 source pixels per row, as
     * needed by the 8-wide qpel filters): two 32-bit copies plus the
     * trailing byte. */
    while (h-- > 0) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1295
1296
1297 #define QPEL_MC(r, OPNAME, RND, OP) \
1298 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1299 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1300 int i;\
1301 for(i=0; i<h; i++)\
1302 {\
1303 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1304 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1305 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1306 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1307 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1308 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1309 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1310 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1311 dst+=dstStride;\
1312 src+=srcStride;\
1313 }\
1314 }\
1315 \
1316 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1317 const int w=8;\
1318 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1319 int i;\
1320 for(i=0; i<w; i++)\
1321 {\
1322 const int src0= src[0*srcStride];\
1323 const int src1= src[1*srcStride];\
1324 const int src2= src[2*srcStride];\
1325 const int src3= src[3*srcStride];\
1326 const int src4= src[4*srcStride];\
1327 const int src5= src[5*srcStride];\
1328 const int src6= src[6*srcStride];\
1329 const int src7= src[7*srcStride];\
1330 const int src8= src[8*srcStride];\
1331 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1332 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1333 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1334 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1335 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1336 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1337 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1338 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1339 dst++;\
1340 src++;\
1341 }\
1342 }\
1343 \
1344 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1345 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1346 int i;\
1347 \
1348 for(i=0; i<h; i++)\
1349 {\
1350 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1351 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1352 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1353 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1354 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1355 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1356 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1357 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1358 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1359 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1360 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1361 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1362 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1363 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1364 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1365 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1366 dst+=dstStride;\
1367 src+=srcStride;\
1368 }\
1369 }\
1370 \
1371 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1372 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1373 int i;\
1374 const int w=16;\
1375 for(i=0; i<w; i++)\
1376 {\
1377 const int src0= src[0*srcStride];\
1378 const int src1= src[1*srcStride];\
1379 const int src2= src[2*srcStride];\
1380 const int src3= src[3*srcStride];\
1381 const int src4= src[4*srcStride];\
1382 const int src5= src[5*srcStride];\
1383 const int src6= src[6*srcStride];\
1384 const int src7= src[7*srcStride];\
1385 const int src8= src[8*srcStride];\
1386 const int src9= src[9*srcStride];\
1387 const int src10= src[10*srcStride];\
1388 const int src11= src[11*srcStride];\
1389 const int src12= src[12*srcStride];\
1390 const int src13= src[13*srcStride];\
1391 const int src14= src[14*srcStride];\
1392 const int src15= src[15*srcStride];\
1393 const int src16= src[16*srcStride];\
1394 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1395 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1396 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1397 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1398 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1399 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1400 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1401 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1402 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1403 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1404 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1405 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1406 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1407 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1408 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1409 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1410 dst++;\
1411 src++;\
1412 }\
1413 }\
1414 \
1415 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1416 OPNAME ## pixels8_c(dst, src, stride, 8);\
1417 }\
1418 \
1419 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1420 uint8_t half[64];\
1421 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1422 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1423 }\
1424 \
1425 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1426 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1427 }\
1428 \
1429 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1430 uint8_t half[64];\
1431 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1432 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1433 }\
1434 \
1435 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1436 uint8_t full[16*9];\
1437 uint8_t half[64];\
1438 copy_block9(full, src, 16, stride, 9);\
1439 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1440 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1441 }\
1442 \
1443 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1444 uint8_t full[16*9];\
1445 copy_block9(full, src, 16, stride, 9);\
1446 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1447 }\
1448 \
1449 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1450 uint8_t full[16*9];\
1451 uint8_t half[64];\
1452 copy_block9(full, src, 16, stride, 9);\
1453 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1454 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1455 }\
1456 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1457 uint8_t full[16*9];\
1458 uint8_t halfH[72];\
1459 uint8_t halfV[64];\
1460 uint8_t halfHV[64];\
1461 copy_block9(full, src, 16, stride, 9);\
1462 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1463 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1464 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1465 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1466 }\
1467 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1468 uint8_t full[16*9];\
1469 uint8_t halfH[72];\
1470 uint8_t halfHV[64];\
1471 copy_block9(full, src, 16, stride, 9);\
1472 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1473 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1474 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1475 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1476 }\
1477 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1478 uint8_t full[16*9];\
1479 uint8_t halfH[72];\
1480 uint8_t halfV[64];\
1481 uint8_t halfHV[64];\
1482 copy_block9(full, src, 16, stride, 9);\
1483 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1484 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1485 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1486 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1487 }\
1488 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1489 uint8_t full[16*9];\
1490 uint8_t halfH[72];\
1491 uint8_t halfHV[64];\
1492 copy_block9(full, src, 16, stride, 9);\
1493 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1494 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1495 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1496 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1497 }\
1498 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1499 uint8_t full[16*9];\
1500 uint8_t halfH[72];\
1501 uint8_t halfV[64];\
1502 uint8_t halfHV[64];\
1503 copy_block9(full, src, 16, stride, 9);\
1504 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1505 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1506 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1507 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1508 }\
1509 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1510 uint8_t full[16*9];\
1511 uint8_t halfH[72];\
1512 uint8_t halfHV[64];\
1513 copy_block9(full, src, 16, stride, 9);\
1514 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1515 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1516 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1517 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1518 }\
1519 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1520 uint8_t full[16*9];\
1521 uint8_t halfH[72];\
1522 uint8_t halfV[64];\
1523 uint8_t halfHV[64];\
1524 copy_block9(full, src, 16, stride, 9);\
1525 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1526 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1527 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1528 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1529 }\
1530 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1531 uint8_t full[16*9];\
1532 uint8_t halfH[72];\
1533 uint8_t halfHV[64];\
1534 copy_block9(full, src, 16, stride, 9);\
1535 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1536 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1537 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1538 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1539 }\
1540 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1541 uint8_t halfH[72];\
1542 uint8_t halfHV[64];\
1543 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1544 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1545 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1546 }\
1547 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1548 uint8_t halfH[72];\
1549 uint8_t halfHV[64];\
1550 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1551 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1552 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1553 }\
1554 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1555 uint8_t full[16*9];\
1556 uint8_t halfH[72];\
1557 uint8_t halfV[64];\
1558 uint8_t halfHV[64];\
1559 copy_block9(full, src, 16, stride, 9);\
1560 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1561 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1562 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1563 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1564 }\
1565 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1566 uint8_t full[16*9];\
1567 uint8_t halfH[72];\
1568 copy_block9(full, src, 16, stride, 9);\
1569 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1570 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1571 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1572 }\
1573 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1574 uint8_t full[16*9];\
1575 uint8_t halfH[72];\
1576 uint8_t halfV[64];\
1577 uint8_t halfHV[64];\
1578 copy_block9(full, src, 16, stride, 9);\
1579 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1580 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1581 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1582 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1583 }\
1584 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1585 uint8_t full[16*9];\
1586 uint8_t halfH[72];\
1587 copy_block9(full, src, 16, stride, 9);\
1588 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1589 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1590 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1591 }\
1592 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1593 uint8_t halfH[72];\
1594 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1595 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1596 }\
1597 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1598 OPNAME ## pixels16_c(dst, src, stride, 16);\
1599 }\
1600 \
1601 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1602 uint8_t half[256];\
1603 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1604 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1605 }\
1606 \
1607 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1608 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1609 }\
1610 \
1611 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1612 uint8_t half[256];\
1613 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1614 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1615 }\
1616 \
1617 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1618 uint8_t full[24*17];\
1619 uint8_t half[256];\
1620 copy_block17(full, src, 24, stride, 17);\
1621 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1622 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1623 }\
1624 \
1625 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1626 uint8_t full[24*17];\
1627 copy_block17(full, src, 24, stride, 17);\
1628 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1629 }\
1630 \
1631 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1632 uint8_t full[24*17];\
1633 uint8_t half[256];\
1634 copy_block17(full, src, 24, stride, 17);\
1635 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1636 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1637 }\
1638 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1639 uint8_t full[24*17];\
1640 uint8_t halfH[272];\
1641 uint8_t halfV[256];\
1642 uint8_t halfHV[256];\
1643 copy_block17(full, src, 24, stride, 17);\
1644 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1645 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1646 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1647 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1648 }\
1649 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1650 uint8_t full[24*17];\
1651 uint8_t halfH[272];\
1652 uint8_t halfHV[256];\
1653 copy_block17(full, src, 24, stride, 17);\
1654 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1655 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1656 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1657 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1658 }\
1659 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1660 uint8_t full[24*17];\
1661 uint8_t halfH[272];\
1662 uint8_t halfV[256];\
1663 uint8_t halfHV[256];\
1664 copy_block17(full, src, 24, stride, 17);\
1665 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1666 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1667 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1668 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1669 }\
1670 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1671 uint8_t full[24*17];\
1672 uint8_t halfH[272];\
1673 uint8_t halfHV[256];\
1674 copy_block17(full, src, 24, stride, 17);\
1675 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1676 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1677 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1678 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1679 }\
1680 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1681 uint8_t full[24*17];\
1682 uint8_t halfH[272];\
1683 uint8_t halfV[256];\
1684 uint8_t halfHV[256];\
1685 copy_block17(full, src, 24, stride, 17);\
1686 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1687 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1688 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1689 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1690 }\
1691 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1692 uint8_t full[24*17];\
1693 uint8_t halfH[272];\
1694 uint8_t halfHV[256];\
1695 copy_block17(full, src, 24, stride, 17);\
1696 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1697 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1698 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1699 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1700 }\
1701 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1702 uint8_t full[24*17];\
1703 uint8_t halfH[272];\
1704 uint8_t halfV[256];\
1705 uint8_t halfHV[256];\
1706 copy_block17(full, src, 24, stride, 17);\
1707 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1708 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1709 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1710 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1711 }\
1712 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1713 uint8_t full[24*17];\
1714 uint8_t halfH[272];\
1715 uint8_t halfHV[256];\
1716 copy_block17(full, src, 24, stride, 17);\
1717 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1718 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1719 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1720 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1721 }\
1722 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1723 uint8_t halfH[272];\
1724 uint8_t halfHV[256];\
1725 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1726 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1727 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1728 }\
1729 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1730 uint8_t halfH[272];\
1731 uint8_t halfHV[256];\
1732 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1733 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1734 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1735 }\
1736 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1737 uint8_t full[24*17];\
1738 uint8_t halfH[272];\
1739 uint8_t halfV[256];\
1740 uint8_t halfHV[256];\
1741 copy_block17(full, src, 24, stride, 17);\
1742 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1743 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1744 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1745 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1746 }\
1747 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1748 uint8_t full[24*17];\
1749 uint8_t halfH[272];\
1750 copy_block17(full, src, 24, stride, 17);\
1751 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1752 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1753 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1754 }\
1755 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1756 uint8_t full[24*17];\
1757 uint8_t halfH[272];\
1758 uint8_t halfV[256];\
1759 uint8_t halfHV[256];\
1760 copy_block17(full, src, 24, stride, 17);\
1761 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1762 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1763 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1764 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1765 }\
1766 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1767 uint8_t full[24*17];\
1768 uint8_t halfH[272];\
1769 copy_block17(full, src, 24, stride, 17);\
1770 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1771 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1772 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1773 }\
1774 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1775 uint8_t halfH[272];\
1776 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1777 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1778 }
1779
/* Rounding ops used to instantiate QPEL_MC below.  The lowpass filters leave
 * values scaled by 32, so (x+16)>>5 rounds to nearest and (x+15)>>5 is the
 * no-rounding variant; cm[] crops the result to 0..255. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the put, no-rounding put and avg variants of the MPEG-4 qpel
 * motion-compensation functions (the avg_no_rnd set is unused and disabled). */
QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1793
#if 1
/*
 * H264_LOWPASS instantiates the H.264 half-pel interpolation filters for
 * 4x4, 8x8 and (by composition) 16x16 blocks, using the 6-tap kernel
 * (1,-5,20,20,-5,1).  OP writes a 1D filter result (values scaled by 32,
 * cropped by the caller-supplied op macro), OP2 writes the 2D (hv) result
 * (values scaled by 32*32).  The hv variants first filter horizontally into
 * the int16_t tmp buffer, then vertically out of it.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride; /* start 2 rows above: vertical pass needs 2 taps of context */\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2); /* rewind to the first row with full filter context */\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride; /* start 2 rows above: vertical pass needs 2 taps of context */\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2); /* rewind to the first row with full filter context */\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\
1997
/*
 * H264_MC instantiates the 16 quarter-pel motion-compensation functions
 * (mcXY, X = horizontal and Y = vertical quarter-pel offset in units of 1/4)
 * for one block SIZE.  Half-pel planes come from the _h/_v/_hv lowpass
 * filters; quarter-pel positions are produced by averaging (pixels_l2) the
 * two nearest planes.  copy_block pads the source with the 2+3 extra rows
 * the vertical filter needs; full_mid points past that 2-row lead-in.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
2134
/* 1D H.264 filter output is scaled by 32: (x+16)>>5 rounds back to 8 bits. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
/* 2D (hv) output is scaled by 32*32: (x+512)>>10. */
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]

/* Instantiate put and avg variants of the H.264 qpel functions for all sizes. */
H264_LOWPASS(put_ , op_put, op2_put)
H264_LOWPASS(avg_ , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2155
2156 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2157 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2158 int i;
2159
2160 for(i=0; i<h; i++){
2161 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2162 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2163 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2164 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2165 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2166 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2167 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2168 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2169 dst+=dstStride;
2170 src+=srcStride;
2171 }
2172 }
2173
2174 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2175 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2176 int i;
2177
2178 for(i=0; i<w; i++){
2179 const int src_1= src[ -srcStride];
2180 const int src0 = src[0 ];
2181 const int src1 = src[ srcStride];
2182 const int src2 = src[2*srcStride];
2183 const int src3 = src[3*srcStride];
2184 const int src4 = src[4*srcStride];
2185 const int src5 = src[5*srcStride];
2186 const int src6 = src[6*srcStride];
2187 const int src7 = src[7*srcStride];
2188 const int src8 = src[8*srcStride];
2189 const int src9 = src[9*srcStride];
2190 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2191 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2192 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2193 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2194 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2195 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2196 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2197 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2198 src++;
2199 dst++;
2200 }
2201 }
2202
/* mspel (0,0): integer-pel position, plain 8x8 block copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2206
/* mspel (1/4,0): average of the source block and its horizontally
 * half-pel-filtered version. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hfilt[64];
    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src, hfilt, stride, stride, 8, 8);
}
2212
/* mspel (1/2,0): horizontal half-pel, filtered straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2216
/* mspel (3/4,0): average of the right neighbour (src+1) and the
 * horizontally half-pel-filtered version. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hfilt[64];
    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, hfilt, stride, stride, 8, 8);
}
2222
/* mspel (0,1/2): vertical half-pel, filtered straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2226
/* mspel (1/4,1/2): average of the vertically filtered block and the
 * horizontally-then-vertically filtered block.  The horizontal pass runs on
 * 11 rows starting one row above so the vertical pass (on hfilt+8) has the
 * context it needs. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hfilt[88];
    uint8_t vfilt[64];
    uint8_t hvfilt[64];
    wmv2_mspel8_h_lowpass(hfilt, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vfilt, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvfilt, hfilt+8, 8, 8, 8);
    put_pixels8_l2(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/* mspel (3/4,1/2): like mc12 but the pure vertical filter runs on the right
 * neighbour column (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hfilt[88];
    uint8_t vfilt[64];
    uint8_t hvfilt[64];
    wmv2_mspel8_h_lowpass(hfilt, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vfilt, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvfilt, hfilt+8, 8, 8, 8);
    put_pixels8_l2(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
/* mspel (1/2,1/2): horizontal half-pel pass into a temporary, then the
 * vertical pass straight into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hfilt[88];
    wmv2_mspel8_h_lowpass(hfilt, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hfilt+8, stride, 8, 8);
}
2250
2251
/* Sum of absolute differences between two 16x16 pixel blocks. */
static inline int pix_abs16x16_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2279
/* SAD of a 16x16 block against the horizontal half-pel interpolation of
 * pix2 (rounded average of each pixel and its right neighbour). */
static int pix_abs16x16_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2307
/* SAD of a 16x16 block against the vertical half-pel interpolation of
 * pix2 (rounded average of each pixel and the one below it). */
static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2337
/* SAD of a 16x16 block against the diagonal half-pel interpolation of
 * pix2 (rounded average of the 2x2 neighbourhood). */
static int pix_abs16x16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x+1], pix3[x], pix3[x+1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2367
/* Sum of absolute differences between two 8x8 pixel blocks. */
static inline int pix_abs8x8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2387
/* SAD of an 8x8 block against the horizontal half-pel interpolation of pix2. */
static int pix_abs8x8_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2407
/* SAD of an 8x8 block against the vertical half-pel interpolation of pix2. */
static int pix_abs8x8_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2429
/* SAD of an 8x8 block against the diagonal half-pel interpolation of pix2. */
static int pix_abs8x8_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x+1], pix3[x], pix3[x+1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2451
/* me_cmp-style wrapper around pix_abs16x16_c (the context pointer s is unused). */
static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs16x16_c(a,b,stride);
}
2455
/* me_cmp-style wrapper around pix_abs8x8_c (the context pointer s is unused). */
static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs8x8_c(a,b,stride);
}
2459
2460 /**
2461 * permutes an 8x8 block.
2462 * @param block the block which will be permuted according to the given permutation vector
2463 * @param permutation the permutation vector
2464 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2465 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2466 * (inverse) permutated to scantable order!
2467 */
2468 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2469 {
2470 int i;
2471 DCTELEM temp[64];
2472
2473 if(last<=0) return;
2474 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2475
2476 for(i=0; i<=last; i++){
2477 const int j= scantable[i];
2478 temp[j]= block[j];
2479 block[j]=0;
2480 }
2481
2482 for(i=0; i<=last; i++){
2483 const int j= scantable[i];
2484 const int perm_j= permutation[j];
2485 block[perm_j]= temp[j];
2486 }
2487 }
2488
2489 /**
2490 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2491 */
2492 static void clear_blocks_c(DCTELEM *blocks)
2493 {
2494 memset(blocks, 0, sizeof(DCTELEM)*6*64);
2495 }
2496
/* dst[i] += src[i] for 0 <= i < w; byte arithmetic wraps modulo 256.
   (The reference C version needs no manual unrolling; platform-specific
   replacements handle the fast path.) */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int idx;
    for (idx = 0; idx < w; idx++)
        dst[idx] = dst[idx] + src[idx];
}
2512
/* dst[i] = src1[i] - src2[i] for 0 <= i < w; byte arithmetic wraps
   modulo 256, so negative differences come out as 256 + diff. */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int idx;
    for (idx = 0; idx < w; idx++)
        dst[idx] = src1[idx] - src2[idx];
}
2528
/**
 * HuffYUV-style prediction residual: for each i,
 *   dst[i] = src2[i] - mid_pred(left, src1[i], (left + src1[i] - left_top) & 0xFF)
 * where left / left_top carry the running state in and out via pointers.
 * NOTE(review): mid_pred() is defined elsewhere in this file; presumably a
 * median-of-three — verify against its definition.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t cur_left = *left;
    uint8_t top_left = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(cur_left, src1[i], (cur_left + src1[i] - top_left) & 0xFF);
        top_left = src1[i];
        cur_left = src2[i];
        dst[i]   = cur_left - pred;
    }

    *left     = cur_left;
    *left_top = top_left;
}
2546
/* o1/o2 get the sum and difference of i1/i2 (each input expanded twice,
   so only pass side-effect-free expressions) */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* in-place butterfly: x,y := x+y, x-y; local temporaries make each
   argument get evaluated only once per read */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x+y| + |x-y|: a final butterfly stage folded directly into the
   absolute-value accumulation */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2561
/**
 * SATD comparison metric: applies an 8x8 Hadamard transform to the
 * src - dst pixel differences and returns the sum of the absolute
 * transform coefficients. The context argument is unused here.
 */
static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
    int i;
    int temp[64];
    int sum=0;

    /* horizontal pass: per-row 8-point butterfly network,
       stages at distances 1, 2 and 4 */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: first two butterfly stages, with the last stage
       folded into BUTTERFLYA, which also accumulates the absolute values */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
2611
/**
 * Like hadamard8_diff_c but against a constant: Hadamard-transforms
 * src[x][y] - mean over an 8x8 block and returns the sum of the absolute
 * transform coefficients.
 */
static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
    int i;
    int temp[64];
    int sum=0;
//FIXME OOOPS ignore 0 term instead of mean mess
    /* horizontal pass: per-row butterfly stages at distances 1, 2 and 4 */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-mean,src[stride*i+1]-mean);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-mean,src[stride*i+3]-mean);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-mean,src[stride*i+5]-mean);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-mean,src[stride*i+7]-mean);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass; final stage folded into BUTTERFLYA with the
       absolute-value accumulation */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    return sum;
}
2655
2656 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2657 MpegEncContext * const s= (MpegEncContext *)c;
2658 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2659 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2660 int sum=0, i;
2661
2662 s->dsp.diff_pixels(temp, src1, src2, stride);
2663 s->dsp.fdct(temp);
2664
2665 for(i=0; i<64; i++)
2666 sum+= ABS(temp[i]);
2667
2668 return sum;
2669 }
2670
2671 void simple_idct(DCTELEM *block); //FIXME
2672
/**
 * Quantization-noise metric: runs the 8x8 difference block through the
 * encoder's quantize / unquantize / IDCT round trip and returns the summed
 * squared error against the unprocessed difference.
 * NOTE(review): assumes fast_dct_quantize performs the forward DCT as part
 * of quantization — confirm against its implementation.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* 8-byte aligned scratch: first 64 DCTELEMs are processed, next 64 hold the backup */
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    s->mb_intra=0; // force inter-mode quantization for this measurement

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM)); // keep the original difference for comparison

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME

    /* sum of squared differences between the round-tripped block and the original */
    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
2695
/**
 * Rate-distortion metric for an 8x8 block: quantizes the src1 - src2
 * difference, counts the VLC bits needed to code the coefficients,
 * reconstructs the block and returns distortion plus a qscale-weighted
 * bit cost.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    uint64_t __align8 aligned_bak[stride]; // VLA: stride*8 bytes, holds 8 reconstruction rows
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distoration, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    /* copy the 8 reference rows (8 bytes each, as two 32-bit words) so the
       IDCT can add the reconstructed difference onto them */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC length tables matching the coding mode */
    if (s->mb_intra) {
        start_i = 1;
        length = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* count the bits for each (run, level) pair in scan order; the final
       coefficient uses the "last" table */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64; // bias level into 0..127 so the table check below works
                if((level&(~127)) == 0){ // level in -64..63 -> in the VLC table
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64); // the last coefficient must be nonzero

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* reconstruct: dequantize and add the IDCT result onto the saved rows */
    if(last>=0){
        s->dct_unquantize(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distoration= s->dsp.sse[1](NULL, bak, src1, stride);

    /* distortion + lambda * bits; 109/128 * qscale^2 approximates lambda */
    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2769
/**
 * Bit-count metric for an 8x8 block: quantizes the src1 - src2 difference
 * and returns only the number of VLC bits needed to code the coefficients
 * (no distortion term, unlike rd8x8_c).
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC length tables matching the coding mode */
    if (s->mb_intra) {
        start_i = 1;
        length = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* count the bits for each (run, level) pair in scan order; the final
       coefficient uses the "last" table */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64; // bias level into 0..127 so the table check below works
                if((level&(~127)) == 0){ // level in -64..63 -> in the VLC table
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64); // the last coefficient must be nonzero

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2827
2828
/* Instantiate 16x16 variants of the 8x8 metrics via the WARPER88_1616
   macro — presumably each wrapper applies the 8x8 function to the four
   8x8 quadrants of a 16x16 block; see the macro definition to confirm. */
WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
WARPER88_1616(rd8x8_c, rd16x16_c)
WARPER88_1616(bit8x8_c, bit16x16_c)
2834
/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */
/* jrevdct-based IDCT whose clamped result replaces the pixels at dest */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* jrevdct-based IDCT whose clamped result is added onto the pixels at dest */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
2847
2848 /* init static data */
2849 void dsputil_static_init(void)
2850 {
2851 int i;
2852
2853 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
2854 for(i=0;i<MAX_NEG_CROP;i++) {
2855 cropTbl[i] = 0;
2856 cropTbl[i + MAX_NEG_CROP + 256] = 255;
2857 }
2858
2859 for(i=0;i<512;i++) {
2860 squareTbl[i] = (i - 256) * (i - 256);
2861 }
2862
2863 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2864 }
2865
2866
/**
 * Fills the DSPContext with the C reference implementations, selected
 * according to the codec options in avctx, then lets the platform-specific
 * init functions override entries with optimized versions, and finally
 * builds the IDCT coefficient permutation table.
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

#ifdef CONFIG_ENCODERS
    /* forward DCT selection (encoder builds only) */
    if(avctx->dct_algo==FF_DCT_FASTINT)
        c->fdct = fdct_ifast;
    else if(avctx->dct_algo==FF_DCT_FAAN)
        c->fdct = ff_faandct;
    else
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
#endif //CONFIG_ENCODERS

    /* inverse DCT selection; the permutation type must match the chosen IDCT */
    if(avctx->idct_algo==FF_IDCT_INT){
        c->idct_put= ff_jref_idct_put;
        c->idct_add= ff_jref_idct_add;
        c->idct = j_rev_dct;
        c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
    }else{ //accurate/default
        c->idct_put= simple_idct_put;
        c->idct_add= simple_idct_add;
        c->idct = simple_idct;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }

    /* basic block/pixel helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->gmc1 = gmc1_c;
    c->gmc = gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;

    /* SAD primitives for full/half-pel positions */
    /* TODO [0] 16 [1] 8 */
    c->pix_abs16x16 = pix_abs16x16_c;
    c->pix_abs16x16_x2 = pix_abs16x16_x2_c;
    c->pix_abs16x16_y2 = pix_abs16x16_y2_c;
    c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
    c->pix_abs8x8 = pix_abs8x8_c;
    c->pix_abs8x8_x2 = pix_abs8x8_x2_c;
    c->pix_abs8x8_y2 = pix_abs8x8_y2_c;
    c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;

/* half-pel put/avg tables: [IDX] selects block width, [0..3] the
   none/x/y/xy interpolation variant */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    /* third-pel motion compensation tables; only the listed indices are
       set here — presumably the remaining ones are unused, verify against
       the callers */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

/* quarter-pel tables: all 16 subpixel positions (x + 4*y indexing) */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    /* H.264 chroma motion compensation (8/4/2 pixel widths) */
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;

    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

    /* comparison metrics: [0] is the 16x16 variant, [1] the 8x8 one */
    c->hadamard8_diff[0]= hadamard8_diff16_c;
    c->hadamard8_diff[1]= hadamard8_diff_c;
    c->hadamard8_abs = hadamard8_abs_c;

    c->dct_sad[0]= dct_sad16x16_c;
    c->dct_sad[1]= dct_sad8x8_c;

    c->sad[0]= sad16x16_c;
    c->sad[1]= sad8x8_c;

    c->quant_psnr[0]= quant_psnr16x16_c;
    c->quant_psnr[1]= quant_psnr8x8_c;

    c->rd[0]= rd16x16_c;
    c->rd[1]= rd8x8_c;

    c->bit[0]= bit16x16_c;
    c->bit[1]= bit8x8_c;

    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;

    /* let platform-specific code replace entries with optimized versions */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif
#ifdef ARCH_SH4
    dsputil_init_sh4(c,avctx);
#endif

    /* build the coefficient permutation table matching the selected IDCT
       (idct_permutation_type may have been changed by the arch inits) */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    default:
        fprintf(stderr, "Internal error, IDCT permutation not set\n");
    }
}
3076