63fb32e426d92a7c200ceefcf1d6ba0135fe3945
[libav.git] / libavcodec / dsputil.c
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21 */
22
23 /**
24 * @file dsputil.c
25 * DSP utils
26 */
27
28 #include "avcodec.h"
29 #include "dsputil.h"
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
32 #include "faandct.h"
33
/* Clipping table: cropTbl[MAX_NEG_CROP + v] maps v into 0..255.
 * NOTE(review): filled by init code outside this chunk — confirm in dsputil init. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* squareTbl[256 + v] presumably holds v*v for v in [-256, 255]; the +256 bias
 * lets the SSE routines below index with negative pixel differences.
 * NOTE(review): also filled outside this chunk. */
uint32_t squareTbl[512];
36
/* Classic zigzag scan order: entry k is the raster index (row*8 + column)
 * of the k-th coefficient of an 8x8 block in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
47
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields.
   (Same convention as ff_zigzag_direct: entry k is the raster index of the
   k-th coefficient in scan order.) */
const uint8_t ff_zigzag248_direct[64] = {
    0,   8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
60
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): filled at runtime by init code outside this chunk; __align8
 * is the project's 8-byte-alignment attribute macro. */
uint16_t __align8 inv_zigzag_direct16[64];
63
/* Alternate horizontal scan pattern (entry k = raster index of the k-th
 * coefficient in scan order). NOTE(review): the logic selecting this scan
 * lives outside this chunk. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
74
/* Alternate vertical scan pattern (entry k = raster index of the k-th
 * coefficient in scan order). NOTE(review): the logic selecting this scan
 * lives outside this chunk. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
85
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* i.e. inverse[b] == ceil(2^32 / b) — division by a constant replaced by a
 * multiply and shift; entries 0 and 1 are sentinels (0 and 2^32-1). */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
121
/* Input permutation for the simple_idct_mmx */
/* Each entry is a source coefficient index 0..63 (shown in hex) giving the
 * layout simple_idct_mmx expects. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
133
/**
 * Sum of all 256 pixel values of a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size stride between rows in bytes
 * @return the pixel sum (fits easily in an int: max 256*255)
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
155
/**
 * Sum of squares of all 256 pixel values of a 16x16 block ("block energy").
 * NOTE(review): sq[v] is presumably v*v — squareTbl is initialized outside
 * this chunk; the +256 bias exists so the SSE functions can index with
 * negative differences. The fast paths below read 4/8 bytes at a time
 * through casted pointers, assuming unaligned loads and this kind of type
 * punning are acceptable on the target — same convention as LD32/LD64 users
 * elsewhere in this file.
 */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            /* byte-at-a-time reference version (disabled) */
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            /* 64-bit hosts: one 8-byte load, then square each byte lane */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* 32-bit hosts: two 4-byte loads per group of 8 pixels */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16; /* step to the start of the next row */
    }
    return s;
}
203
/**
 * Byte-swap w 32-bit words from src into dst using the project-provided
 * bswap_32 macro; each word is swapped independently, so dst may equal src.
 * Processed in groups of 8 with a scalar tail.
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i = 0;

    for (; i + 8 <= w; i += 8) {
        int k;
        for (k = 0; k < 8; k++)
            dst[i + k] = bswap_32(src[i + k]);
    }
    for (; i < w; i++)
        dst[i] = bswap_32(src[i]);
}
221
222 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
223 {
224 int s, i;
225 uint32_t *sq = squareTbl + 256;
226
227 s = 0;
228 for (i = 0; i < h; i++) {
229 s += sq[pix1[0] - pix2[0]];
230 s += sq[pix1[1] - pix2[1]];
231 s += sq[pix1[2] - pix2[2]];
232 s += sq[pix1[3] - pix2[3]];
233 s += sq[pix1[4] - pix2[4]];
234 s += sq[pix1[5] - pix2[5]];
235 s += sq[pix1[6] - pix2[6]];
236 s += sq[pix1[7] - pix2[7]];
237 pix1 += line_size;
238 pix2 += line_size;
239 }
240 return s;
241 }
242
243 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
244 {
245 int s, i;
246 uint32_t *sq = squareTbl + 256;
247
248 s = 0;
249 for (i = 0; i < h; i++) {
250 s += sq[pix1[ 0] - pix2[ 0]];
251 s += sq[pix1[ 1] - pix2[ 1]];
252 s += sq[pix1[ 2] - pix2[ 2]];
253 s += sq[pix1[ 3] - pix2[ 3]];
254 s += sq[pix1[ 4] - pix2[ 4]];
255 s += sq[pix1[ 5] - pix2[ 5]];
256 s += sq[pix1[ 6] - pix2[ 6]];
257 s += sq[pix1[ 7] - pix2[ 7]];
258 s += sq[pix1[ 8] - pix2[ 8]];
259 s += sq[pix1[ 9] - pix2[ 9]];
260 s += sq[pix1[10] - pix2[10]];
261 s += sq[pix1[11] - pix2[11]];
262 s += sq[pix1[12] - pix2[12]];
263 s += sq[pix1[13] - pix2[13]];
264 s += sq[pix1[14] - pix2[14]];
265 s += sq[pix1[15] - pix2[15]];
266
267 pix1 += line_size;
268 pix2 += line_size;
269 }
270 return s;
271 }
272
273 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
274 {
275 int i;
276
277 /* read the pixels */
278 for(i=0;i<8;i++) {
279 block[0] = pixels[0];
280 block[1] = pixels[1];
281 block[2] = pixels[2];
282 block[3] = pixels[3];
283 block[4] = pixels[4];
284 block[5] = pixels[5];
285 block[6] = pixels[6];
286 block[7] = pixels[7];
287 pixels += line_size;
288 block += 8;
289 }
290 }
291
292 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
293 const uint8_t *s2, int stride){
294 int i;
295
296 /* read the pixels */
297 for(i=0;i<8;i++) {
298 block[0] = s1[0] - s2[0];
299 block[1] = s1[1] - s2[1];
300 block[2] = s1[2] - s2[2];
301 block[3] = s1[3] - s2[3];
302 block[4] = s1[4] - s2[4];
303 block[5] = s1[5] - s2[5];
304 block[6] = s1[6] - s2[6];
305 block[7] = s1[7] - s2[7];
306 s1 += stride;
307 s2 += stride;
308 block += 8;
309 }
310 }
311
312
313 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
314 int line_size)
315 {
316 int i;
317 uint8_t *cm = cropTbl + MAX_NEG_CROP;
318
319 /* read the pixels */
320 for(i=0;i<8;i++) {
321 pixels[0] = cm[block[0]];
322 pixels[1] = cm[block[1]];
323 pixels[2] = cm[block[2]];
324 pixels[3] = cm[block[3]];
325 pixels[4] = cm[block[4]];
326 pixels[5] = cm[block[5]];
327 pixels[6] = cm[block[6]];
328 pixels[7] = cm[block[7]];
329
330 pixels += line_size;
331 block += 8;
332 }
333 }
334
335 static void put_signed_pixels_clamped_c(const DCTELEM *block,
336 uint8_t *restrict pixels,
337 int line_size)
338 {
339 int i, j;
340
341 for (i = 0; i < 8; i++) {
342 for (j = 0; j < 8; j++) {
343 if (*block < -128)
344 *pixels = 0;
345 else if (*block > 127)
346 *pixels = 255;
347 else
348 *pixels = (uint8_t)(*block + 128);
349 block++;
350 pixels++;
351 }
352 pixels += (line_size - 8);
353 }
354 }
355
356 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
357 int line_size)
358 {
359 int i;
360 uint8_t *cm = cropTbl + MAX_NEG_CROP;
361
362 /* read the pixels */
363 for(i=0;i<8;i++) {
364 pixels[0] = cm[pixels[0] + block[0]];
365 pixels[1] = cm[pixels[1] + block[1]];
366 pixels[2] = cm[pixels[2] + block[2]];
367 pixels[3] = cm[pixels[3] + block[3]];
368 pixels[4] = cm[pixels[4] + block[4]];
369 pixels[5] = cm[pixels[5] + block[5]];
370 pixels[6] = cm[pixels[6] + block[6]];
371 pixels[7] = cm[pixels[7] + block[7]];
372 pixels += line_size;
373 block += 8;
374 }
375 }
#if 0

/* Disabled 64-bit variant of the pixel-op generator: processes whole 8-byte
 * rows with uint64_t SWAR arithmetic (LD64 is the project's unaligned load).
 * Kept for reference; the active 32-bit variant follows in the #else branch.
 * NOTE(review): no comments are placed inside the macro bodies to keep the
 * line continuations intact. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* rounding average of packed bytes, 64 bits at a time */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

/* Active 32-bit pixel-op generator: for a given OPNAME (put/avg) and store
 * operation OP it emits copy/average primitives for 2/4/8/16 pixel widths,
 * with x2/y2/xy2 half-pel variants (rounding and non-rounding), 2- and
 * 4-source blenders (_l2/_l4) and 16-wide wrappers via CALL_2X_PIXELS.
 * LD16/LD32, rnd_avg32 and no_rnd_avg32 come from project headers.
 * NOTE(review): no comments inside the macro body — line continuations. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD16(&src1[i*src_stride1  ]);\
        b= LD16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                 int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i, a0, b0, a1, b1;\
    a0= pixels[0];\
    b0= pixels[1] + 2;\
    a0 += b0;\
    b0 += pixels[2];\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        a1= pixels[0];\
        b1= pixels[1];\
        a1 += b1;\
        b1 += pixels[2];\
\
        block[0]= (a1+a0)>>2; /* FIXME non put */\
        block[1]= (b1+b0)>>2;\
\
        pixels+=line_size;\
        block +=line_size;\
\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        block[0]= (a1+a0)>>2;\
        block[1]= (b1+b0)>>2;\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint32_t a= LD32(pixels  );\
    const uint32_t b= LD32(pixels+1);\
    uint32_t l0=  (a&0x03030303UL)\
                + (b&0x03030303UL)\
                + 0x02020202UL;\
    uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
               + ((b&0xFCFCFCFCUL)>>2);\
    uint32_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint32_t a= LD32(pixels  );\
        uint32_t b= LD32(pixels+1);\
        l1=  (a&0x03030303UL)\
           + (b&0x03030303UL);\
        h1= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD32(pixels  );\
        b= LD32(pixels+1);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

/* rounding average of packed bytes, 32 bits at a time (project helper) */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
/* plain store */
#define op_put(a, b) a = b

/* Instantiate the averaging (avg_*) and storing (put_*) pixel primitives. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
894
/* Rounded 2- and 4-tap averages used by the subpel interpolation code.
 * Arguments fully parenthesized: the originals expanded e.g.
 * avg2(x ? 1 : 3, b) with wrong precedence. */
#define avg2(a, b)       (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
897
/* Thin wrappers around the macro-generated non-rounding two-source
 * averagers, fixing all three strides to the same value so the functions
 * match the simpler (dst, a, b, stride, h) signature.
 * NOTE(review): presumably installed in DSPContext function-pointer tables
 * — confirm against dsputil.h. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
905
/**
 * One-warp-point global motion compensation: bilinear interpolation of an
 * 8-wide block of height h at 1/16-pel fractional offset (x16, y16).
 * Weights satisfy A+B+C+D == 256, so each output is
 * (A*tl + B*tr + C*bl + D*br + rounder) >> 8.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16); /* top-left weight */
    const int B = (     x16) * (16 - y16); /* top-right weight */
    const int C = (16 - x16) * (     y16); /* bottom-left weight */
    const int D = (     x16) * (     y16); /* bottom-right weight */
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (A * src[col] + B * src[col + 1] +
                        C * src[stride + col] + D * src[stride + col + 1] +
                        rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
928
/**
 * Global motion compensation with an affine warp, writing an 8-wide block
 * of height h. (ox, oy) is the fixed-point source position of the row
 * origin; it advances by (dxx, dyx) per output column and (dxy, dyy) per
 * output row. The low 16 bits of each position are discarded and the next
 * 'shift' bits form the subpel fraction, s = 1<<shift; r is the rounding
 * constant added before the final >> (shift*2).
 * Samples outside [0, width) x [0, height) are clamped to the border with
 * the project helper clip() (defined outside this chunk), with degenerate
 * 1-D / nearest-neighbour interpolation on the border cases.
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* switch to inclusive maxima so the (unsigned) range checks below work */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* (unsigned) compare rejects negatives and >= max in one test */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
                                         + src[index +1]* frac_x )*(s-frac_y)
                                       + ( src[index+stride ]*(s-frac_x)
                                         + src[index+stride+1]* frac_x )* frac_y
                                       + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp y, interpolate in x only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
                                         + src[index +1]* frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp x, interpolate in y only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
                                         + src[index+stride ]* frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* fully outside: nearest clamped sample */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
986
/* Full-pel thirdpel case: plain block copy, dispatched on the block width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
995
/* Thirdpel interpolation, horizontal phase 1/3: out = round((2*left + right)/3),
   done as a fixed-point multiply by 683 (~1/3 in Q11). */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int acc = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*acc) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1006
/* Thirdpel interpolation, horizontal phase 2/3: out = round((left + 2*right)/3). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int acc = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*acc) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1017
/* Thirdpel interpolation, vertical phase 1/3: out = round((2*top + bottom)/3). */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int acc = 2*src[col] + src[col+stride] + 1;
            dst[col] = (683*acc) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1028
/* Thirdpel interpolation at (1/3, 1/3): 2D weighted average of the 2x2
   neighbourhood; weights sum to 12, normalised by 2731/2^15 (~1/12). */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int tl = src[col],        tr = src[col+1];
            const int bl = src[col+stride], br = src[col+stride+1];
            dst[col] = (2731*(4*tl + 3*tr + 3*bl + 2*br + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1039
/* Thirdpel interpolation at (1/3, 2/3): 2D weighted average of the 2x2
   neighbourhood; weights sum to 12, normalised by 2731/2^15 (~1/12). */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int tl = src[col],        tr = src[col+1];
            const int bl = src[col+stride], br = src[col+stride+1];
            dst[col] = (2731*(3*tl + 2*tr + 4*bl + 3*br + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1050
/* Thirdpel interpolation, vertical phase 2/3: out = round((top + 2*bottom)/3). */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int acc = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*acc) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1061
/* Thirdpel interpolation at (2/3, 1/3): 2D weighted average of the 2x2
   neighbourhood; weights sum to 12, normalised by 2731/2^15 (~1/12). */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int tl = src[col],        tr = src[col+1];
            const int bl = src[col+stride], br = src[col+stride+1];
            dst[col] = (2731*(3*tl + 4*tr + 2*bl + 3*br + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1072
/* Thirdpel interpolation at (2/3, 2/3): 2D weighted average of the 2x2
   neighbourhood; weights sum to 12, normalised by 2731/2^15 (~1/12). */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int tl = src[col],        tr = src[col+1];
            const int bl = src[col+stride], br = src[col+stride+1];
            dst[col] = (2731*(2*tl + 3*tr + 3*bl + 4*br + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1083
/* Full-pel thirdpel case with averaging: dispatch on width to the plain
   rounding-average copy routine. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1092
/* Thirdpel (1/3, 0) interpolation, then rounding average with what is
   already in dst (bi-directional prediction). */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1103
/* Thirdpel (2/3, 0) interpolation, then rounding average with dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1114
/* Thirdpel (0, 1/3) interpolation, then rounding average with dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1125
/* Thirdpel (1/3, 1/3) interpolation, then rounding average with dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int tl = src[col],        tr = src[col+1];
            const int bl = src[col+stride], br = src[col+stride+1];
            const int interp = (2731*(4*tl + 3*tr + 3*bl + 2*br + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1136
/* Thirdpel (1/3, 2/3) interpolation, then rounding average with dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int tl = src[col],        tr = src[col+1];
            const int bl = src[col+stride], br = src[col+stride+1];
            const int interp = (2731*(3*tl + 2*tr + 4*bl + 3*br + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1147
/* Thirdpel (0, 2/3) interpolation, then rounding average with dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1158
/* Thirdpel (2/3, 1/3) interpolation, then rounding average with dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int tl = src[col],        tr = src[col+1];
            const int bl = src[col+stride], br = src[col+stride+1];
            const int interp = (2731*(3*tl + 4*tr + 2*bl + 3*br + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1169
/* Thirdpel (2/3, 2/3) interpolation, then rounding average with dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int tl = src[col],        tr = src[col+1];
            const int bl = src[col+stride], br = src[col+stride+1];
            const int interp = (2731*(2*tl + 3*tr + 3*bl + 4*br + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Disabled: would generate width-specialised thirdpel wrapper functions.
   NOTE(review): the stray `void` before each forwarded call looks like a
   leftover return-type token — this would not compile as intended if the
   block were re-enabled; confirm and remove before enabling. */
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1201
/*
 * Generates the 2/4/8-wide H.264 chroma motion-compensation functions.
 * Each output pixel is a bilinear blend of the 2x2 source neighbourhood
 * with eighth-pel weights A..D (x, y in 0..7; the weights sum to 64).
 * OP performs the final normalisation (>>6 with rounding) and, for the
 * avg variants, the rounding average with the existing destination.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1264
/* Final ops for the chroma MC: the bilinear weights sum to 64, so op_put
   normalises with (b+32)>>6; op_avg additionally rounding-averages the
   result with the current destination pixel. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1272
/* Copy a 4-pixel-wide block, h rows, one unaligned 32-bit load/store per row. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows;
    for (rows = h; rows > 0; rows--) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1283
/* Copy an 8-pixel-wide block, h rows, two unaligned 32-bit load/stores per row. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows;
    for (rows = h; rows > 0; rows--) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
1295
/* Copy a 16-pixel-wide block, h rows, four unaligned 32-bit load/stores per row. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows;
    for (rows = h; rows > 0; rows--) {
        ST32(dst,      LD32(src));
        ST32(dst + 4,  LD32(src + 4));
        ST32(dst + 8,  LD32(src + 8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}
1309
/* Copy a 17-pixel-wide block (16 via 32-bit stores plus one tail byte),
   h rows — used to stage qpel16 sources with one extra edge column. */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows;
    for (rows = h; rows > 0; rows--) {
        ST32(dst,      LD32(src));
        ST32(dst + 4,  LD32(src + 4));
        ST32(dst + 8,  LD32(src + 8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1324
/* Copy a 9-pixel-wide block (8 via 32-bit stores plus one tail byte),
   h rows — used to stage qpel8 sources with one extra edge column. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int rows;
    for (rows = h; rows > 0; rows--) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1337
1338
1339 #define QPEL_MC(r, OPNAME, RND, OP) \
1340 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1341 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1342 int i;\
1343 for(i=0; i<h; i++)\
1344 {\
1345 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1346 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1347 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1348 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1349 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1350 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1351 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1352 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1353 dst+=dstStride;\
1354 src+=srcStride;\
1355 }\
1356 }\
1357 \
1358 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1359 const int w=8;\
1360 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1361 int i;\
1362 for(i=0; i<w; i++)\
1363 {\
1364 const int src0= src[0*srcStride];\
1365 const int src1= src[1*srcStride];\
1366 const int src2= src[2*srcStride];\
1367 const int src3= src[3*srcStride];\
1368 const int src4= src[4*srcStride];\
1369 const int src5= src[5*srcStride];\
1370 const int src6= src[6*srcStride];\
1371 const int src7= src[7*srcStride];\
1372 const int src8= src[8*srcStride];\
1373 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1374 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1375 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1376 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1377 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1378 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1379 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1380 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1381 dst++;\
1382 src++;\
1383 }\
1384 }\
1385 \
1386 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1387 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1388 int i;\
1389 \
1390 for(i=0; i<h; i++)\
1391 {\
1392 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1393 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1394 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1395 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1396 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1397 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1398 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1399 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1400 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1401 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1402 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1403 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1404 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1405 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1406 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1407 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1408 dst+=dstStride;\
1409 src+=srcStride;\
1410 }\
1411 }\
1412 \
1413 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1414 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1415 int i;\
1416 const int w=16;\
1417 for(i=0; i<w; i++)\
1418 {\
1419 const int src0= src[0*srcStride];\
1420 const int src1= src[1*srcStride];\
1421 const int src2= src[2*srcStride];\
1422 const int src3= src[3*srcStride];\
1423 const int src4= src[4*srcStride];\
1424 const int src5= src[5*srcStride];\
1425 const int src6= src[6*srcStride];\
1426 const int src7= src[7*srcStride];\
1427 const int src8= src[8*srcStride];\
1428 const int src9= src[9*srcStride];\
1429 const int src10= src[10*srcStride];\
1430 const int src11= src[11*srcStride];\
1431 const int src12= src[12*srcStride];\
1432 const int src13= src[13*srcStride];\
1433 const int src14= src[14*srcStride];\
1434 const int src15= src[15*srcStride];\
1435 const int src16= src[16*srcStride];\
1436 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1437 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1438 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1439 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1440 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1441 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1442 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1443 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1444 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1445 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1446 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1447 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1448 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1449 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1450 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1451 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1452 dst++;\
1453 src++;\
1454 }\
1455 }\
1456 \
1457 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1458 OPNAME ## pixels8_c(dst, src, stride, 8);\
1459 }\
1460 \
1461 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1462 uint8_t half[64];\
1463 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1464 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1465 }\
1466 \
1467 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1468 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1469 }\
1470 \
1471 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1472 uint8_t half[64];\
1473 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1474 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1475 }\
1476 \
1477 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1478 uint8_t full[16*9];\
1479 uint8_t half[64];\
1480 copy_block9(full, src, 16, stride, 9);\
1481 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1482 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1483 }\
1484 \
1485 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1486 uint8_t full[16*9];\
1487 copy_block9(full, src, 16, stride, 9);\
1488 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1489 }\
1490 \
1491 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1492 uint8_t full[16*9];\
1493 uint8_t half[64];\
1494 copy_block9(full, src, 16, stride, 9);\
1495 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1496 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1497 }\
1498 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1499 uint8_t full[16*9];\
1500 uint8_t halfH[72];\
1501 uint8_t halfV[64];\
1502 uint8_t halfHV[64];\
1503 copy_block9(full, src, 16, stride, 9);\
1504 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1505 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1506 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1507 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1508 }\
1509 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1510 uint8_t full[16*9];\
1511 uint8_t halfH[72];\
1512 uint8_t halfHV[64];\
1513 copy_block9(full, src, 16, stride, 9);\
1514 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1515 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1516 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1517 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1518 }\
1519 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1520 uint8_t full[16*9];\
1521 uint8_t halfH[72];\
1522 uint8_t halfV[64];\
1523 uint8_t halfHV[64];\
1524 copy_block9(full, src, 16, stride, 9);\
1525 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1526 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1527 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1528 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1529 }\
1530 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1531 uint8_t full[16*9];\
1532 uint8_t halfH[72];\
1533 uint8_t halfHV[64];\
1534 copy_block9(full, src, 16, stride, 9);\
1535 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1536 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1537 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1538 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1539 }\
1540 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1541 uint8_t full[16*9];\
1542 uint8_t halfH[72];\
1543 uint8_t halfV[64];\
1544 uint8_t halfHV[64];\
1545 copy_block9(full, src, 16, stride, 9);\
1546 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1547 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1548 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1549 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1550 }\
1551 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1552 uint8_t full[16*9];\
1553 uint8_t halfH[72];\
1554 uint8_t halfHV[64];\
1555 copy_block9(full, src, 16, stride, 9);\
1556 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1557 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1558 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1559 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1560 }\
1561 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1562 uint8_t full[16*9];\
1563 uint8_t halfH[72];\
1564 uint8_t halfV[64];\
1565 uint8_t halfHV[64];\
1566 copy_block9(full, src, 16, stride, 9);\
1567 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1568 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1569 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1570 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1571 }\
1572 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1573 uint8_t full[16*9];\
1574 uint8_t halfH[72];\
1575 uint8_t halfHV[64];\
1576 copy_block9(full, src, 16, stride, 9);\
1577 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1578 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1579 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1580 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1581 }\
1582 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1583 uint8_t halfH[72];\
1584 uint8_t halfHV[64];\
1585 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1586 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1587 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1588 }\
1589 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1590 uint8_t halfH[72];\
1591 uint8_t halfHV[64];\
1592 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1593 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1594 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1595 }\
1596 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1597 uint8_t full[16*9];\
1598 uint8_t halfH[72];\
1599 uint8_t halfV[64];\
1600 uint8_t halfHV[64];\
1601 copy_block9(full, src, 16, stride, 9);\
1602 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1603 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1604 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1605 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1606 }\
1607 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1608 uint8_t full[16*9];\
1609 uint8_t halfH[72];\
1610 copy_block9(full, src, 16, stride, 9);\
1611 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1612 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1613 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1614 }\
1615 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1616 uint8_t full[16*9];\
1617 uint8_t halfH[72];\
1618 uint8_t halfV[64];\
1619 uint8_t halfHV[64];\
1620 copy_block9(full, src, 16, stride, 9);\
1621 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1622 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1623 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1624 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1625 }\
1626 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1627 uint8_t full[16*9];\
1628 uint8_t halfH[72];\
1629 copy_block9(full, src, 16, stride, 9);\
1630 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1631 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1632 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1633 }\
1634 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1635 uint8_t halfH[72];\
1636 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1637 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1638 }\
1639 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1640 OPNAME ## pixels16_c(dst, src, stride, 16);\
1641 }\
1642 \
1643 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1644 uint8_t half[256];\
1645 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1646 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1647 }\
1648 \
1649 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1650 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1651 }\
1652 \
1653 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1654 uint8_t half[256];\
1655 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1656 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1657 }\
1658 \
1659 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1660 uint8_t full[24*17];\
1661 uint8_t half[256];\
1662 copy_block17(full, src, 24, stride, 17);\
1663 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1664 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1665 }\
1666 \
1667 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1668 uint8_t full[24*17];\
1669 copy_block17(full, src, 24, stride, 17);\
1670 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1671 }\
1672 \
1673 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1674 uint8_t full[24*17];\
1675 uint8_t half[256];\
1676 copy_block17(full, src, 24, stride, 17);\
1677 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1678 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1679 }\
1680 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1681 uint8_t full[24*17];\
1682 uint8_t halfH[272];\
1683 uint8_t halfV[256];\
1684 uint8_t halfHV[256];\
1685 copy_block17(full, src, 24, stride, 17);\
1686 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1687 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1688 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1689 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1690 }\
1691 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1692 uint8_t full[24*17];\
1693 uint8_t halfH[272];\
1694 uint8_t halfHV[256];\
1695 copy_block17(full, src, 24, stride, 17);\
1696 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1697 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1698 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1699 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1700 }\
1701 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1702 uint8_t full[24*17];\
1703 uint8_t halfH[272];\
1704 uint8_t halfV[256];\
1705 uint8_t halfHV[256];\
1706 copy_block17(full, src, 24, stride, 17);\
1707 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1708 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1709 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1710 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1711 }\
1712 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1713 uint8_t full[24*17];\
1714 uint8_t halfH[272];\
1715 uint8_t halfHV[256];\
1716 copy_block17(full, src, 24, stride, 17);\
1717 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1718 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1719 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1720 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1721 }\
1722 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1723 uint8_t full[24*17];\
1724 uint8_t halfH[272];\
1725 uint8_t halfV[256];\
1726 uint8_t halfHV[256];\
1727 copy_block17(full, src, 24, stride, 17);\
1728 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1729 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1730 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1731 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1732 }\
1733 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1734 uint8_t full[24*17];\
1735 uint8_t halfH[272];\
1736 uint8_t halfHV[256];\
1737 copy_block17(full, src, 24, stride, 17);\
1738 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1739 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1740 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1741 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1742 }\
1743 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1744 uint8_t full[24*17];\
1745 uint8_t halfH[272];\
1746 uint8_t halfV[256];\
1747 uint8_t halfHV[256];\
1748 copy_block17(full, src, 24, stride, 17);\
1749 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1750 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1751 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1752 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1753 }\
1754 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1755 uint8_t full[24*17];\
1756 uint8_t halfH[272];\
1757 uint8_t halfHV[256];\
1758 copy_block17(full, src, 24, stride, 17);\
1759 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1760 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1761 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1762 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1763 }\
1764 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1765 uint8_t halfH[272];\
1766 uint8_t halfHV[256];\
1767 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1768 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1769 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1770 }\
1771 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1772 uint8_t halfH[272];\
1773 uint8_t halfHV[256];\
1774 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1775 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1776 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1777 }\
1778 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1779 uint8_t full[24*17];\
1780 uint8_t halfH[272];\
1781 uint8_t halfV[256];\
1782 uint8_t halfHV[256];\
1783 copy_block17(full, src, 24, stride, 17);\
1784 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1785 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1786 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1787 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1788 }\
1789 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1790 uint8_t full[24*17];\
1791 uint8_t halfH[272];\
1792 copy_block17(full, src, 24, stride, 17);\
1793 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1794 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1795 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1796 }\
1797 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1798 uint8_t full[24*17];\
1799 uint8_t halfH[272];\
1800 uint8_t halfV[256];\
1801 uint8_t halfHV[256];\
1802 copy_block17(full, src, 24, stride, 17);\
1803 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1804 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1805 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1806 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1807 }\
1808 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1809 uint8_t full[24*17];\
1810 uint8_t halfH[272];\
1811 copy_block17(full, src, 24, stride, 17);\
1812 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1813 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1814 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1815 }\
1816 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1817 uint8_t halfH[272];\
1818 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1819 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1820 }
1821
/* Final-store macros for the MPEG-4 qpel filters expanded by QPEL_MC below.
 * `b` is the raw filter sum; it is rounded (+16, or +15 for the
 * no-rounding variants), shifted down by 5 and clipped through the local
 * crop table `cm`. The avg variants additionally average with the current
 * destination pixel, rounding up. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the rounding and no-rounding put/avg qpel function sets. */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1835
#if 1
/**
 * H264_LOWPASS(OPNAME, OP, OP2)
 * Expands to the C reference implementations of the H.264 6-tap
 * (1,-5,20,20,-5,1) luma interpolation filters for 4x4, 8x8 and 16x16
 * blocks:
 *  - *_h_lowpass:  horizontal pass, one OP() store per output pixel
 *  - *_v_lowpass:  vertical pass over a column of source samples
 *  - *_hv_lowpass: horizontal pass into a 16-bit tmp buffer with h+5
 *    rows of vertical context, followed by a vertical pass over tmp
 *    stored through OP2() (the intermediate values are unscaled, so OP2
 *    must apply wider rounding than OP — see op2_put/op2_avg below)
 *  - the 16-wide versions are composed of four 8x8 sub-calls
 * OP/OP2 are statement macros OP(dst, val) that round, shift and clip
 * `val` into `dst`; they reference the local crop table `cm` declared by
 * each generated function.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/**
 * H264_MC(OPNAME, SIZE)
 * Expands to the 16 H.264 quarter-pel motion compensation functions
 * OPNAME##h264_qpel##SIZE##_mcXY_c for a SIZExSIZE block, where X/Y in
 * {0..3} are the horizontal/vertical quarter-pel offsets:
 *  - mc00: plain copy/average via pixelsSIZE
 *  - mc20/mc02: half-pel via the h/v lowpass filters
 *  - mc10/mc30 (mc01/mc03): half-pel plane averaged with the nearest
 *    integer column (row)
 *  - mc22: 2D half-pel via the hv lowpass (int16_t tmp buffer)
 *  - all other positions: average of two intermediate planes
 * `full` holds a copy of the source with 2 extra rows above and 3 below
 * (SIZE+5 rows total) so the vertical filter can read outside the
 * block; `full_mid` points at the first in-block row.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

/* Final-store macros for the H.264 filters above: `b` is the raw 6-tap
 * sum, scaled by 32 after one pass (taps 1-5+20+20-5+1) or by 1024 after
 * the 2D hv pass — hence +16>>5 vs +512>>10 rounding — then clipped
 * through the crop table `cm`. op_put stores; op_avg averages with the
 * existing destination pixel, rounding up. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate put and avg variants for 4x4, 8x8 and 16x16 blocks. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2197
2198 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2199 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2200 int i;
2201
2202 for(i=0; i<h; i++){
2203 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2204 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2205 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2206 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2207 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2208 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2209 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2210 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2211 dst+=dstStride;
2212 src+=srcStride;
2213 }
2214 }
2215
2216 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2217 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2218 int i;
2219
2220 for(i=0; i<w; i++){
2221 const int src_1= src[ -srcStride];
2222 const int src0 = src[0 ];
2223 const int src1 = src[ srcStride];
2224 const int src2 = src[2*srcStride];
2225 const int src3 = src[3*srcStride];
2226 const int src4 = src[4*srcStride];
2227 const int src5 = src[5*srcStride];
2228 const int src6 = src[6*srcStride];
2229 const int src7 = src[7*srcStride];
2230 const int src8 = src[8*srcStride];
2231 const int src9 = src[9*srcStride];
2232 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2233 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2234 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2235 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2236 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2237 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2238 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2239 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2240 src++;
2241 dst++;
2242 }
2243 }
2244
/* WMV2 mspel MC position (0,0): plain 8x8 pixel copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2248
/* WMV2 mspel MC position (1,0): average of the unfiltered source and
 * the horizontal half-pel plane (quarter-pel left of center). */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2254
/* WMV2 mspel MC position (2,0): horizontal half-pel filter directly
 * into the destination. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2258
/* WMV2 mspel MC position (3,0): average of the right-neighbour source
 * column and the horizontal half-pel plane (quarter-pel right of
 * center). */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2264
/* WMV2 mspel MC position (0,2): vertical half-pel filter directly into
 * the destination. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2268
/* WMV2 mspel MC position (1,2): averages the vertical half-pel plane
 * (halfV) with the 2D h-then-v half-pel plane (halfHV). halfH is
 * computed from src-stride with 11 rows so the vertical pass has
 * context; halfH+8 skips the extra top row. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* WMV2 mspel MC position (3,2): like mc12 but the vertical half-pel
 * plane is taken one column to the right (src+1); the 2D plane is the
 * same h-then-v filter of the 11-row halfH buffer. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* WMV2 mspel MC position (2,2): 2D half-pel — horizontal filter into an
 * 11-row buffer (one context row above, two below), then the vertical
 * filter of that buffer (skipping the top context row) into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2292
2293 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2294 int x;
2295 const int strength= ff_h263_loop_filter_strength[qscale];
2296
2297 for(x=0; x<8; x++){
2298 int d1, d2, ad1;
2299 int p0= src[x-2*stride];
2300 int p1= src[x-1*stride];
2301 int p2= src[x+0*stride];
2302 int p3= src[x+1*stride];
2303 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2304
2305 if (d<-2*strength) d1= 0;
2306 else if(d<- strength) d1=-2*strength - d;
2307 else if(d< strength) d1= d;
2308 else if(d< 2*strength) d1= 2*strength - d;
2309 else d1= 0;
2310
2311 p1 += d1;
2312 p2 -= d1;
2313 if(p1&256) p1= ~(p1>>31);
2314 if(p2&256) p2= ~(p2>>31);
2315
2316 src[x-1*stride] = p1;
2317 src[x+0*stride] = p2;
2318
2319 ad1= ABS(d1)>>1;
2320
2321 d2= clip((p0-p3)/4, -ad1, ad1);
2322
2323 src[x-2*stride] = p0 - d2;
2324 src[x+ stride] = p3 + d2;
2325 }
2326 }
2327
2328 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2329 int y;
2330 const int strength= ff_h263_loop_filter_strength[qscale];
2331
2332 for(y=0; y<8; y++){
2333 int d1, d2, ad1;
2334 int p0= src[y*stride-2];
2335 int p1= src[y*stride-1];
2336 int p2= src[y*stride+0];
2337 int p3= src[y*stride+1];
2338 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2339
2340 if (d<-2*strength) d1= 0;
2341 else if(d<- strength) d1=-2*strength - d;
2342 else if(d< strength) d1= d;
2343 else if(d< 2*strength) d1= 2*strength - d;
2344 else d1= 0;
2345
2346 p1 += d1;
2347 p2 -= d1;
2348 if(p1&256) p1= ~(p1>>31);
2349 if(p2&256) p2= ~(p2>>31);
2350
2351 src[y*stride-1] = p1;
2352 src[y*stride+0] = p2;
2353
2354 ad1= ABS(d1)>>1;
2355
2356 d2= clip((p0-p3)/4, -ad1, ad1);
2357
2358 src[y*stride-2] = p0 - d2;
2359 src[y*stride+1] = p3 + d2;
2360 }
2361 }
2362
2363 static void h261_v_loop_filter_c(uint8_t *dest,uint8_t *src, int stride){
2364 int i,j,xy,yz;
2365 int res;
2366 for(i=0; i<8; i++){
2367 for(j=1; j<7; j++){
2368 xy = j * stride + i;
2369 yz = j * 8 + i;
2370 res = (int)src[yz-1*8] + ((int)(src[yz+0*8]) * 2) + (int)src[yz+1*8];
2371 res +=2;
2372 res >>=2;
2373 dest[xy] = (uint8_t)res;
2374 }
2375 }
2376 }
2377
2378 static void h261_h_loop_filter_c(uint8_t *dest,uint8_t *src, int stride){
2379 int i,j,xy,yz;
2380 int res;
2381 for(i=1; i<7; i++){
2382 for(j=0; j<8; j++){
2383 xy = j * stride + i;
2384 yz = j * 8 + i;
2385 res = (int)src[yz-1] + ((int)(src[yz]) *2) + (int)src[yz+1];
2386 res+=2;
2387 res>>=2;
2388 dest[xy] = (uint8_t)res;
2389 }
2390 }
2391 }
2392
2393 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2394 {
2395 int s, i;
2396
2397 s = 0;
2398 for(i=0;i<h;i++) {
2399 s += abs(pix1[0] - pix2[0]);
2400 s += abs(pix1[1] - pix2[1]);
2401 s += abs(pix1[2] - pix2[2]);
2402 s += abs(pix1[3] - pix2[3]);
2403 s += abs(pix1[4] - pix2[4]);
2404 s += abs(pix1[5] - pix2[5]);
2405 s += abs(pix1[6] - pix2[6]);
2406 s += abs(pix1[7] - pix2[7]);
2407 s += abs(pix1[8] - pix2[8]);
2408 s += abs(pix1[9] - pix2[9]);
2409 s += abs(pix1[10] - pix2[10]);
2410 s += abs(pix1[11] - pix2[11]);
2411 s += abs(pix1[12] - pix2[12]);
2412 s += abs(pix1[13] - pix2[13]);
2413 s += abs(pix1[14] - pix2[14]);
2414 s += abs(pix1[15] - pix2[15]);
2415 pix1 += line_size;
2416 pix2 += line_size;
2417 }
2418 return s;
2419 }
2420
2421 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2422 {
2423 int s, i;
2424
2425 s = 0;
2426 for(i=0;i<h;i++) {
2427 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2428 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2429 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2430 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2431 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2432 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2433 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2434 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2435 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2436 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2437 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2438 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2439 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2440 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2441 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2442 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
2443 pix1 += line_size;
2444 pix2 += line_size;
2445 }
2446 return s;
2447 }
2448
2449 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2450 {
2451 int s, i;
2452 uint8_t *pix3 = pix2 + line_size;
2453
2454 s = 0;
2455 for(i=0;i<h;i++) {
2456 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2457 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2458 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2459 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2460 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2461 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2462 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2463 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2464 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2465 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2466 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2467 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2468 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2469 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2470 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2471 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
2472 pix1 += line_size;
2473 pix2 += line_size;
2474 pix3 += line_size;
2475 }
2476 return s;
2477 }
2478
2479 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2480 {
2481 int s, i;
2482 uint8_t *pix3 = pix2 + line_size;
2483
2484 s = 0;
2485 for(i=0;i<h;i++) {
2486 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2487 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2488 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2489 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2490 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2491 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2492 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2493 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2494 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2495 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2496 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2497 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2498 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2499 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2500 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2501 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
2502 pix1 += line_size;
2503 pix2 += line_size;
2504 pix3 += line_size;
2505 }
2506 return s;
2507 }
2508
2509 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2510 {
2511 int s, i;
2512
2513 s = 0;
2514 for(i=0;i<h;i++) {
2515 s += abs(pix1[0] - pix2[0]);
2516 s += abs(pix1[1] - pix2[1]);
2517 s += abs(pix1[2] - pix2[2]);
2518 s += abs(pix1[3] - pix2[3]);
2519 s += abs(pix1[4] - pix2[4]);
2520 s += abs(pix1[5] - pix2[5]);
2521 s += abs(pix1[6] - pix2[6]);
2522 s += abs(pix1[7] - pix2[7]);
2523 pix1 += line_size;
2524 pix2 += line_size;
2525 }
2526 return s;
2527 }
2528
2529 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2530 {
2531 int s, i;
2532
2533 s = 0;
2534 for(i=0;i<h;i++) {
2535 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2536 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2537 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2538 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2539 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2540 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2541 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2542 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2543 pix1 += line_size;
2544 pix2 += line_size;
2545 }
2546 return s;
2547 }
2548
2549 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2550 {
2551 int s, i;
2552 uint8_t *pix3 = pix2 + line_size;
2553
2554 s = 0;
2555 for(i=0;i<h;i++) {
2556 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2557 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2558 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2559 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2560 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2561 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2562 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2563 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2564 pix1 += line_size;
2565 pix2 += line_size;
2566 pix3 += line_size;
2567 }
2568 return s;
2569 }
2570
2571 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2572 {
2573 int s, i;
2574 uint8_t *pix3 = pix2 + line_size;
2575
2576 s = 0;
2577 for(i=0;i<h;i++) {
2578 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2579 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2580 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2581 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2582 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2583 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2584 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2585 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2586 pix1 += line_size;
2587 pix2 += line_size;
2588 pix3 += line_size;
2589 }
2590 return s;
2591 }
2592
/**
 * Evaluates the residual left after adding basis*scale to rem: returns a
 * scaled, per-coefficient weighted sum of squares over the 8x8 block.
 * Presumably used by the encoder's basis/noise-shaping search -- confirm
 * against the callers that install this in DSPContext.
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        /* scale the basis coefficient down from BASIS_SHIFT to RECON_SHIFT
           precision with rounding, then add it to the residual */
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        /* the >>4 below relies on |w*b| staying small enough not to
           overflow int; the assert bounds b */
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
2607
2608 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2609 int i;
2610
2611 for(i=0; i<8*8; i++){
2612 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2613 }
2614 }
2615
2616 /**
2617 * permutes an 8x8 block.
2618 * @param block the block which will be permuted according to the given permutation vector
2619 * @param permutation the permutation vector
2620 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2621 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2622 * (inverse) permutated to scantable order!
2623 */
2624 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2625 {
2626 int i;
2627 DCTELEM temp[64];
2628
2629 if(last<=0) return;
2630 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2631
2632 for(i=0; i<=last; i++){
2633 const int j= scantable[i];
2634 temp[j]= block[j];
2635 block[j]=0;
2636 }
2637
2638 for(i=0; i<=last; i++){
2639 const int j= scantable[i];
2640 const int perm_j= permutation[j];
2641 block[perm_j]= temp[j];
2642 }
2643 }
2644
/** cmp stub that always reports score 0; installed for FF_CMP_ZERO. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
2648
/**
 * Fills cmp[0..4] (one slot per block-size variant) with the comparison
 * function selected by the low byte of type.  An unknown type logs an
 * error and leaves the slots NULL (from the memset below).
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    /* NOTE(review): assumes sizeof(me_cmp_func) == sizeof(void*); holds on
       common ABIs but is not guaranteed by ISO C -- confirm if porting */
    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        /* only the low byte selects the metric */
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
2691
2692 /**
2693 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2694 */
2695 static void clear_blocks_c(DCTELEM *blocks)
2696 {
2697 memset(blocks, 0, sizeof(DCTELEM)*6*64);
2698 }
2699
2700 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
2701 int i;
2702 for(i=0; i+7<w; i+=8){
2703 dst[i+0] += src[i+0];
2704 dst[i+1] += src[i+1];
2705 dst[i+2] += src[i+2];
2706 dst[i+3] += src[i+3];
2707 dst[i+4] += src[i+4];
2708 dst[i+5] += src[i+5];
2709 dst[i+6] += src[i+6];
2710 dst[i+7] += src[i+7];
2711 }
2712 for(; i<w; i++)
2713 dst[i+0] += src[i+0];
2714 }
2715
2716 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
2717 int i;
2718 for(i=0; i+7<w; i+=8){
2719 dst[i+0] = src1[i+0]-src2[i+0];
2720 dst[i+1] = src1[i+1]-src2[i+1];
2721 dst[i+2] = src1[i+2]-src2[i+2];
2722 dst[i+3] = src1[i+3]-src2[i+3];
2723 dst[i+4] = src1[i+4]-src2[i+4];
2724 dst[i+5] = src1[i+5]-src2[i+5];
2725 dst[i+6] = src1[i+6]-src2[i+6];
2726 dst[i+7] = src1[i+7]-src2[i+7];
2727 }
2728 for(; i<w; i++)
2729 dst[i+0] = src1[i+0]-src2[i+0];
2730 }
2731
/**
 * HuffYUV-style median-prediction residual: for each position,
 * dst[i] = src2[i] - pred, where pred is mid_pred() of the left sample,
 * the top sample (src1[i]) and the gradient left+top-topleft.
 * src1 is the line above, src2 the current line.  *left / *left_top carry
 * the prediction state in and out so rows can be processed in chunks.
 * NOTE(review): mid_pred is defined elsewhere -- presumably the median of
 * its three arguments; confirm in the common header.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        /* gradient term reduced mod 256 so it stays a valid 8-bit sample */
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];  /* next top-left is the current top sample */
        l= src2[i];   /* next left is the current (unpredicted) sample */
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
2749
/* 2-point butterfly into two separate outputs: o1 = i1+i2, o2 = i1-i2.
   o1 is stored before o2 is computed, so o1 must not alias i1/i2 -- all
   uses below write into fresh temp[] slots. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* in-place 2-point butterfly: (x,y) <- (x+y, x-y) */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* last butterfly stage folded into the absolute-value sum: |x+y| + |x-y| */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2764
/**
 * SATD for one 8x8 block (FF_CMP_SATD): applies an 8x8 Hadamard transform
 * to the difference src - dst and returns the sum of the absolute
 * transform coefficients.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 3 butterfly stages == 8-point Hadamard on each row
       of the difference block */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass on each column; the final butterfly stage is folded
       into BUTTERFLYA together with the absolute-value accumulation */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
2816
/**
 * Intra SATD for one 8x8 block: 8x8 Hadamard transform of src itself
 * (dummy is unused), sum of absolute coefficients, with the DC term
 * subtracted so the metric ignores the block mean.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 8-point Hadamard on each source row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass; last stage folded into BUTTERFLYA with the |.| sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    /* temp[0]+temp[32] at this point is the DC coefficient; removing its
       |.| contribution makes the score independent of the mean */
    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2864
/**
 * DCT-domain SAD (FF_CMP_DCT): forward-transforms the difference block
 * src1 - src2 and returns the sum of the absolute DCT coefficients.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* uint64_t backing buffer guarantees 8-byte alignment for DCTELEM[64] */
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum+= ABS(temp[i]);

    return sum;
}
2881
2882 void simple_idct(DCTELEM *block); //FIXME
2883
/**
 * Quantization-error metric (FF_CMP_PSNR): the difference block
 * src1 - src2 is quantized, dequantized (inter path) and inverse
 * transformed, then compared against the untouched difference; returns
 * the summed squared error introduced by quantization at s->qscale.
 * NOTE(review): assumes fast_dct_quantize performs the forward DCT
 * internally so temp and bak end up in comparable domains -- confirm.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* backing buffer for two 8-byte-aligned DCTELEM[64] blocks */
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0; /* force the inter quantizer path below */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* keep a pristine copy of the difference to compare against */
    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
2907
/**
 * Rate-distortion metric (FF_CMP_RD): quantizes the difference block,
 * counts the bits its VLC coding would cost, reconstructs the block and
 * measures the SSE against src1.  Returns distortion plus the bit cost
 * weighted by roughly qscale^2 * 109/128.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    /* NOTE(review): VLA sized by stride (8*stride bytes); each of the 8
       rows below uses stride bytes, so this is exactly big enough -- but
       a large stride makes this a sizeable stack allocation */
    uint64_t __align8 aligned_bak[stride];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distoration, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* copy the 8x8 reference block, two 32-bit words per row;
       NOTE(review): relies on 4-byte alignment of src2/stride and on
       type-punning through uint32_t -- confirm this is safe on the
       targets this C fallback serves */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1; /* DC (index 0) coded separately below */
        length = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* run-length walk over the scan order, summing VLC bit lengths;
           level+64 maps table-representable levels into [0..127], anything
           outside costs the escape length */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the final coefficient uses the "last" VLC table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64); /* the last coefficient must be non-zero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* reconstruct: dequantize and add the idct back onto the reference */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    /* lambda-weighted rate: bits * qscale^2 * 109/128, rounded */
    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2986
2987 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2988 MpegEncContext * const s= (MpegEncContext *)c;
2989 const uint8_t *scantable= s->intra_scantable.permutated;
2990 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2991 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2992 int i, last, run, bits, level, start_i;
2993 const int esc_length= s->ac_esc_length;
2994 uint8_t * length;
2995 uint8_t * last_length;
2996
2997 assert(h==8);
2998
2999 s->dsp.diff_pixels(temp, src1, src2, stride);
3000
3001 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3002
3003 bits=0;