/* libavcodec/dsputil.c */
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21 */
22
23 /**
24 * @file dsputil.c
25 * DSP utils
26 */
27
28 #include "avcodec.h"
29 #include "dsputil.h"
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
32 #include "faandct.h"
33
/* Clipping LUT: indexing at (x + MAX_NEG_CROP) is used below to clamp x to
 * the 0..255 pixel range (see put/add_pixels_clamped_c).  Zeroed here; the
 * actual contents are presumably filled by the DSP init code — not visible
 * in this chunk. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Square LUT, indexed via a +256-biased pointer so negative differences in
 * -256..255 are valid indices (see sse*_c / pix_norm1_c).  Contents filled
 * elsewhere; presumably squareTbl[256 + x] == x*x — TODO confirm at init. */
uint32_t squareTbl[512] = {0, };
36
/* Classic 8x8 zigzag scan order (JPEG/MPEG style): entry n gives the raster
 * index of the n-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
47
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
60
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* Zeroed here; presumably built at init time from ff_zigzag_direct — the
 * filling code is not visible in this chunk.  __align8 keeps it 8-byte
 * aligned for MMX loads. */
uint16_t __align8 inv_zigzag_direct16[64] = {0, };
63
/* Alternate (horizontally biased) 8x8 scan order; companion of
 * ff_alternate_vertical_scan below. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
74
/* Alternate (vertically biased) 8x8 scan order; companion of
 * ff_alternate_horizontal_scan above. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
85
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Fixed-point reciprocal table: inverse[b] ~= 2^32 / b (rounded up), allowing
 * a division by b to be replaced by a 64-bit multiply and shift.  Entries 0
 * and 1 are the saturated values 0 and 2^32-1. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
121
/* Input permutation for the simple_idct_mmx */
/* Each byte is a 6-bit coefficient index (row<<3 | column, here written as
 * row-nibble/column-nibble hex) giving the reordering the MMX IDCT expects. */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
133
/**
 * Sum all 256 pixels of a 16x16 block.
 * @param pix       top-left of the block
 * @param line_size stride between rows in bytes
 * @return the sum of the 16x16 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row;

    for (row = 0; row < 16; row++) {
        int col;
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;      /* advance to next row */
    }
    return total;
}
155
/* Sum of squared pixel values over a 16x16 block (the "norm" used by rate
 * control / ME).  Reads the pixels one machine word at a time and looks each
 * byte up in the square table.
 * NOTE(review): the word-sized loads cast uint8_t* to uint32_t*/uint64_t*,
 * which assumes suitable alignment of pix and violates strict aliasing —
 * intentional era-typical optimization, do not "fix" without benchmarking. */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    /* biased by +256 as elsewhere, although only non-negative indices occur here */
    uint32_t *sq = squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            /* straightforward byte-at-a-time reference version (disabled) */
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            /* 64-bit hosts: load 8 pixels at once and shift out each byte */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* 32-bit hosts: two 4-pixel loads per iteration */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;  /* step to next row, undoing the 16 bytes consumed */
    }
    return s;
}
203
/**
 * Byte-swap w 32-bit words from src into dst.
 * dst[i] = bswap_32(src[i]) for every 0 <= i < w; src and dst may be the
 * same buffer since each element is handled independently.
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int n = 0;

    /* bulk of the work, eight words per pass */
    while (n + 8 <= w) {
        dst[n + 0] = bswap_32(src[n + 0]);
        dst[n + 1] = bswap_32(src[n + 1]);
        dst[n + 2] = bswap_32(src[n + 2]);
        dst[n + 3] = bswap_32(src[n + 3]);
        dst[n + 4] = bswap_32(src[n + 4]);
        dst[n + 5] = bswap_32(src[n + 5]);
        dst[n + 6] = bswap_32(src[n + 6]);
        dst[n + 7] = bswap_32(src[n + 7]);
        n += 8;
    }
    /* up to seven remaining words */
    while (n < w) {
        dst[n] = bswap_32(src[n]);
        n++;
    }
}
221
222 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
223 {
224 int s, i;
225 uint32_t *sq = squareTbl + 256;
226
227 s = 0;
228 for (i = 0; i < h; i++) {
229 s += sq[pix1[0] - pix2[0]];
230 s += sq[pix1[1] - pix2[1]];
231 s += sq[pix1[2] - pix2[2]];
232 s += sq[pix1[3] - pix2[3]];
233 pix1 += line_size;
234 pix2 += line_size;
235 }
236 return s;
237 }
238
239 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
240 {
241 int s, i;
242 uint32_t *sq = squareTbl + 256;
243
244 s = 0;
245 for (i = 0; i < h; i++) {
246 s += sq[pix1[0] - pix2[0]];
247 s += sq[pix1[1] - pix2[1]];
248 s += sq[pix1[2] - pix2[2]];
249 s += sq[pix1[3] - pix2[3]];
250 s += sq[pix1[4] - pix2[4]];
251 s += sq[pix1[5] - pix2[5]];
252 s += sq[pix1[6] - pix2[6]];
253 s += sq[pix1[7] - pix2[7]];
254 pix1 += line_size;
255 pix2 += line_size;
256 }
257 return s;
258 }
259
260 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
261 {
262 int s, i;
263 uint32_t *sq = squareTbl + 256;
264
265 s = 0;
266 for (i = 0; i < h; i++) {
267 s += sq[pix1[ 0] - pix2[ 0]];
268 s += sq[pix1[ 1] - pix2[ 1]];
269 s += sq[pix1[ 2] - pix2[ 2]];
270 s += sq[pix1[ 3] - pix2[ 3]];
271 s += sq[pix1[ 4] - pix2[ 4]];
272 s += sq[pix1[ 5] - pix2[ 5]];
273 s += sq[pix1[ 6] - pix2[ 6]];
274 s += sq[pix1[ 7] - pix2[ 7]];
275 s += sq[pix1[ 8] - pix2[ 8]];
276 s += sq[pix1[ 9] - pix2[ 9]];
277 s += sq[pix1[10] - pix2[10]];
278 s += sq[pix1[11] - pix2[11]];
279 s += sq[pix1[12] - pix2[12]];
280 s += sq[pix1[13] - pix2[13]];
281 s += sq[pix1[14] - pix2[14]];
282 s += sq[pix1[15] - pix2[15]];
283
284 pix1 += line_size;
285 pix2 += line_size;
286 }
287 return s;
288 }
289
290
/* Wavelet-domain distortion score shared by the w53/w97 comparison functions:
 * builds the pix1-pix2 difference block (scaled by 16), runs a spatial DWT
 * over it, then sums the absolute values of all transform coefficients.
 * @param w    block width, 8 or 16 (selects 3 or 4 decomposition levels)
 * @param type wavelet selector passed through to ff_spatial_dwt
 *             (presumably 0 = 9/7, 1 = 5/3 — see the "FIXME 5/3" table below)
 * @return accumulated |coefficient| sum, >>2 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[16*16];
#if 0
    /* disabled per-subband weighting experiment; kept for reference */
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            //8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            //16x16 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {//FIXME 5/3
            //8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            //16x16 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };
#endif

    /* difference block, <<4 to give the integer DWT some headroom */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    /* in-place transform; tmp has a fixed stride of 16 regardless of w */
    ff_spatial_dwt(tmp, w, h, 16, type, dec_count);

    s=0;
#if 0
    /* disabled weighted accumulation matching the scale[] table above */
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int sx= (ori&1) ? 1<<level: 0;
            int stride= 16<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;
            int size= 1<<level;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += ABS(v);
                }
            }
        }
    }
#endif
    /* unweighted L1 norm of all coefficients */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            s+= ABS(tmp[16*i+j+0]);
            s+= ABS(tmp[16*i+j+1]);
            s+= ABS(tmp[16*i+j+2]);
            s+= ABS(tmp[16*i+j+3]);
        }
    }
    assert(s>=0);

    return s>>2;
}
370
/* 8-wide wavelet score, type 1 (presumably the 5/3 filter — see w_c). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}
374
/* 8-wide wavelet score, type 0 (presumably the 9/7 filter — see w_c). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}
378
/* 16-wide wavelet score, type 1 (presumably the 5/3 filter — see w_c). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
382
/* 16-wide wavelet score, type 0 (presumably the 9/7 filter — see w_c). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
386
387 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
388 {
389 int i;
390
391 /* read the pixels */
392 for(i=0;i<8;i++) {
393 block[0] = pixels[0];
394 block[1] = pixels[1];
395 block[2] = pixels[2];
396 block[3] = pixels[3];
397 block[4] = pixels[4];
398 block[5] = pixels[5];
399 block[6] = pixels[6];
400 block[7] = pixels[7];
401 pixels += line_size;
402 block += 8;
403 }
404 }
405
406 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
407 const uint8_t *s2, int stride){
408 int i;
409
410 /* read the pixels */
411 for(i=0;i<8;i++) {
412 block[0] = s1[0] - s2[0];
413 block[1] = s1[1] - s2[1];
414 block[2] = s1[2] - s2[2];
415 block[3] = s1[3] - s2[3];
416 block[4] = s1[4] - s2[4];
417 block[5] = s1[5] - s2[5];
418 block[6] = s1[6] - s2[6];
419 block[7] = s1[7] - s2[7];
420 s1 += stride;
421 s2 += stride;
422 block += 8;
423 }
424 }
425
426
427 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
428 int line_size)
429 {
430 int i;
431 uint8_t *cm = cropTbl + MAX_NEG_CROP;
432
433 /* read the pixels */
434 for(i=0;i<8;i++) {
435 pixels[0] = cm[block[0]];
436 pixels[1] = cm[block[1]];
437 pixels[2] = cm[block[2]];
438 pixels[3] = cm[block[3]];
439 pixels[4] = cm[block[4]];
440 pixels[5] = cm[block[5]];
441 pixels[6] = cm[block[6]];
442 pixels[7] = cm[block[7]];
443
444 pixels += line_size;
445 block += 8;
446 }
447 }
448
449 static void put_signed_pixels_clamped_c(const DCTELEM *block,
450 uint8_t *restrict pixels,
451 int line_size)
452 {
453 int i, j;
454
455 for (i = 0; i < 8; i++) {
456 for (j = 0; j < 8; j++) {
457 if (*block < -128)
458 *pixels = 0;
459 else if (*block > 127)
460 *pixels = 255;
461 else
462 *pixels = (uint8_t)(*block + 128);
463 block++;
464 pixels++;
465 }
466 pixels += (line_size - 8);
467 }
468 }
469
470 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
471 int line_size)
472 {
473 int i;
474 uint8_t *cm = cropTbl + MAX_NEG_CROP;
475
476 /* read the pixels */
477 for(i=0;i<8;i++) {
478 pixels[0] = cm[pixels[0] + block[0]];
479 pixels[1] = cm[pixels[1] + block[1]];
480 pixels[2] = cm[pixels[2] + block[2]];
481 pixels[3] = cm[pixels[3] + block[3]];
482 pixels[4] = cm[pixels[4] + block[4]];
483 pixels[5] = cm[pixels[5] + block[5]];
484 pixels[6] = cm[pixels[6] + block[6]];
485 pixels[7] = cm[pixels[7] + block[7]];
486 pixels += line_size;
487 block += 8;
488 }
489 }
/* ---------------------------------------------------------------------------
 * PIXOP2(OPNAME, OP): generates the whole family of pixel copy/average
 * primitives for one store operator.  OPNAME is the function-name prefix
 * ("put" or "avg"); OP(dst, val) either stores val (put) or averages it into
 * dst (avg).  The _x2/_y2/_xy2 variants produce half-pel interpolation
 * horizontally / vertically / diagonally; the no_rnd variants round down
 * instead of to nearest (different rounding bias constants).
 * The #if 0 branch is a disabled 64-bit-word implementation kept for
 * reference; the live #else branch works 32 bits at a time.
 * ------------------------------------------------------------------------- */
#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

/* Live 32-bit-word variant.  Generates put/avg primitives for block widths
 * 2/4/8/16, plus _l2 (average of two sources) and _l4 (average of four
 * sources) helpers that the half-pel variants are built on. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD16(&src1[i*src_stride1  ]);\
        b= LD16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c     , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

/* avg store: rounded average of old and new 32-bit pixel groups */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
/* put store: plain assignment */
#define op_put(a, b) a = b

/* Instantiate the whole family for both operators, then drop the operator
 * macros so they cannot leak into later code. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
1008
1009 #define avg2(a,b) ((a+b+1)>>1)
1010 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1011
/* Thin adapter: maps the common single-stride call signature onto the
 * generic put_no_rnd_pixels16_l2 helper, which takes separate strides
 * for dst and both sources. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1015
/* Thin adapter: maps the common single-stride call signature onto the
 * generic put_no_rnd_pixels8_l2 helper, which takes separate strides
 * for dst and both sources. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1019
/* One-warp-point global motion compensation for an 8-pixel-wide block of
 * h rows: bilinear interpolation at a fixed 1/16-pel position (x16,y16),
 * each in 0..16.  The four corner weights sum to 256; rounder is added
 * before the final >>8 normalisation.  dst and src use the same stride. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w00 = (16 - x16) * (16 - y16); /* top-left     */
    const int w01 = (     x16) * (16 - y16); /* top-right    */
    const int w10 = (16 - x16) * (     y16); /* bottom-left  */
    const int w11 = (     x16) * (     y16); /* bottom-right */
    int y;

    for (y = 0; y < h; y++) {
        int x;
        for (x = 0; x < 8; x++) {
            dst[x] = (w00 * src[x]          + w01 * src[x + 1]
                    + w10 * src[stride + x] + w11 * src[stride + x + 1]
                    + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1042
/* Full affine global motion compensation for one 8-pixel-wide block of h
 * rows.  (ox,oy) is the 16.16 fixed-point source coordinate of the first
 * destination pixel; (dxx,dyx) is the coordinate step per output column
 * and (dxy,dyy) the step per output row.  s = 1<<shift sub-pel positions
 * are interpolated bilinearly; r is the rounding constant added before
 * the final >>(shift*2).  width/height delimit the valid source area;
 * samples outside it are clamped to the nearest edge (clip() is a
 * project helper).
 * NOTE(review): the fraction is taken from vx>>16 before the final
 * >>shift, i.e. the fraction bits live in bits [16..16+shift) of vx --
 * callers are assumed to scale coordinates accordingly; confirm against
 * the MPEG-4 GMC callers. */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;   /* now the last column with a full 2x2 neighbourhood */
    height--;  /* now the last row with a full 2x2 neighbourhood */

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* the unsigned compares also reject negative coordinates */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: bilinear blend of the 2x2 neighbourhood */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                      + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                      + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp y, interpolate in x only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                      + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp x, interpolate in y only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                      + r)>>(shift*2);
                }else{
                    /* outside in both axes: nearest edge sample, no filtering */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1100
/* Thirdpel MC, full-pel case (0,0): plain block copy, dispatched on the
 * block width.  Unsupported widths are silently ignored (as before). */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1109
/* Thirdpel MC, horizontal 1/3 phase: out = round((2*a + b)/3), computed
 * as (683*(2*a + b + 1)) >> 11 since 683/2048 approximates 1/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = height; row > 0; row--) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (2 * src[col] + src[col + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1120
/* Thirdpel MC, horizontal 2/3 phase: out = round((a + 2*b)/3), computed
 * as (683*(a + 2*b + 1)) >> 11 since 683/2048 approximates 1/3. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = height; row > 0; row--) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (src[col] + 2 * src[col + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1131
/* Thirdpel MC, vertical 1/3 phase: out = round((2*top + bottom)/3),
 * computed as (683*(2*top + bottom + 1)) >> 11 (683/2048 ~ 1/3). */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = height; row > 0; row--) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (2 * src[col] + src[col + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1142
/* Thirdpel MC at (1/3,1/3): weighted 2x2 average with weights 4/3/3/2
 * (sum 12), computed as (2731*(sum + 6)) >> 15 since 2731/32768 ~ 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = height; row > 0; row--) {
        for (col = 0; col < width; col++) {
            const int s00 = src[col];
            const int s01 = src[col + 1];
            const int s10 = src[col + stride];
            const int s11 = src[col + stride + 1];
            dst[col] = (2731 * (4 * s00 + 3 * s01 + 3 * s10 + 2 * s11 + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1153
/* Thirdpel MC at (1/3,2/3): weighted 2x2 average with weights 3/2/4/3
 * (sum 12), computed as (2731*(sum + 6)) >> 15 since 2731/32768 ~ 1/12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = height; row > 0; row--) {
        for (col = 0; col < width; col++) {
            const int s00 = src[col];
            const int s01 = src[col + 1];
            const int s10 = src[col + stride];
            const int s11 = src[col + stride + 1];
            dst[col] = (2731 * (3 * s00 + 2 * s01 + 4 * s10 + 3 * s11 + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1164
/* Thirdpel MC, vertical 2/3 phase: out = round((top + 2*bottom)/3),
 * computed as (683*(top + 2*bottom + 1)) >> 11 (683/2048 ~ 1/3). */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = height; row > 0; row--) {
        for (col = 0; col < width; col++)
            dst[col] = (683 * (src[col] + 2 * src[col + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1175
/* Thirdpel MC at (2/3,1/3): weighted 2x2 average with weights 3/4/2/3
 * (sum 12), computed as (2731*(sum + 6)) >> 15 since 2731/32768 ~ 1/12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = height; row > 0; row--) {
        for (col = 0; col < width; col++) {
            const int s00 = src[col];
            const int s01 = src[col + 1];
            const int s10 = src[col + stride];
            const int s11 = src[col + stride + 1];
            dst[col] = (2731 * (3 * s00 + 4 * s01 + 2 * s10 + 3 * s11 + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1186
/* Thirdpel MC at (2/3,2/3): weighted 2x2 average with weights 2/3/3/4
 * (sum 12), computed as (2731*(sum + 6)) >> 15 since 2731/32768 ~ 1/12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = height; row > 0; row--) {
        for (col = 0; col < width; col++) {
            const int s00 = src[col];
            const int s01 = src[col + 1];
            const int s10 = src[col + stride];
            const int s11 = src[col + stride + 1];
            dst[col] = (2731 * (2 * s00 + 3 * s01 + 3 * s10 + 4 * s11 + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1197
/* Thirdpel MC, full-pel averaging case (0,0): blend src into dst via the
 * plain averaging copy, dispatched on block width.  Unsupported widths
 * are silently ignored (as before). */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1206
/* Averaging thirdpel MC, horizontal 1/3 phase: the interpolated value
 * (683*(2*a + b + 1)) >> 11 is rounded-averaged with the existing dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = height; row > 0; row--) {
        for (col = 0; col < width; col++) {
            const int p = (683 * (2 * src[col] + src[col + 1] + 1)) >> 11;
            dst[col] = (dst[col] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1217
/* Averaging thirdpel MC, horizontal 2/3 phase: the interpolated value
 * (683*(a + 2*b + 1)) >> 11 is rounded-averaged with the existing dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = height; row > 0; row--) {
        for (col = 0; col < width; col++) {
            const int p = (683 * (src[col] + 2 * src[col + 1] + 1)) >> 11;
            dst[col] = (dst[col] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1228
/* Averaging thirdpel MC, vertical 1/3 phase: the interpolated value
 * (683*(2*top + bottom + 1)) >> 11 is rounded-averaged with dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = height; row > 0; row--) {
        for (col = 0; col < width; col++) {
            const int p = (683 * (2 * src[col] + src[col + stride] + 1)) >> 11;
            dst[col] = (dst[col] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1239
/* Averaging thirdpel MC at (1/3,1/3): 2x2 weights 4/3/3/2 (sum 12) via
 * (2731*(sum + 6)) >> 15, then rounded-averaged with the existing dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = height; row > 0; row--) {
        for (col = 0; col < width; col++) {
            const int p = (2731 * (4 * src[col] + 3 * src[col + 1]
                                 + 3 * src[col + stride] + 2 * src[col + stride + 1] + 6)) >> 15;
            dst[col] = (dst[col] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1250
/* Averaging thirdpel MC at (1/3,2/3): 2x2 weights 3/2/4/3 (sum 12) via
 * (2731*(sum + 6)) >> 15, then rounded-averaged with the existing dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = height; row > 0; row--) {
        for (col = 0; col < width; col++) {
            const int p = (2731 * (3 * src[col] + 2 * src[col + 1]
                                 + 4 * src[col + stride] + 3 * src[col + stride + 1] + 6)) >> 15;
            dst[col] = (dst[col] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1261
/* Averaging thirdpel MC, vertical 2/3 phase: the interpolated value
 * (683*(top + 2*bottom + 1)) >> 11 is rounded-averaged with dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = height; row > 0; row--) {
        for (col = 0; col < width; col++) {
            const int p = (683 * (src[col] + 2 * src[col + stride] + 1)) >> 11;
            dst[col] = (dst[col] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1272
/* Averaging thirdpel MC at (2/3,1/3): 2x2 weights 3/4/2/3 (sum 12) via
 * (2731*(sum + 6)) >> 15, then rounded-averaged with the existing dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = height; row > 0; row--) {
        for (col = 0; col < width; col++) {
            const int p = (2731 * (3 * src[col] + 4 * src[col + 1]
                                 + 2 * src[col + stride] + 3 * src[col + stride + 1] + 6)) >> 15;
            dst[col] = (dst[col] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1283
/* Averaging thirdpel MC at (2/3,2/3): 2x2 weights 2/3/3/4 (sum 12) via
 * (2731*(sum + 6)) >> 15, then rounded-averaged with the existing dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = height; row > 0; row--) {
        for (col = 0; col < width; col++) {
            const int p = (2731 * (2 * src[col] + 3 * src[col + 1]
                                 + 3 * src[col + stride] + 4 * src[col + stride + 1] + 6)) >> 15;
            dst[col] = (dst[col] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1294 #if 0
1295 #define TPEL_WIDTH(width)\
1296 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1297 void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1298 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1299 void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1300 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1301 void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1302 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1303 void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1304 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1305 void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1306 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1307 void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1308 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1309 void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1310 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1311 void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1312 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1313 void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1314 #endif
1315
/* Bilinear chroma motion compensation for H.264, generating the 2-, 4-
 * and 8-pixel-wide variants.  (x,y) is the eighth-pel fraction (0..7
 * each); the four corner weights A..D sum to 64, and OP performs the
 * final (+32)>>6 normalisation (plus averaging for the avg variant). */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1378
1379 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1380 #define op_put(a, b) a = (((b) + 32)>>6)
1381
1382 H264_CHROMA_MC(put_ , op_put)
1383 H264_CHROMA_MC(avg_ , op_avg)
1384 #undef op_avg
1385 #undef op_put
1386
/* Copy a 4-byte-wide column of h rows using one 32-bit load/store pair
 * per row (LD32/ST32 are the project's unaligned access macros). */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = h; row > 0; row--) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1397
/* Copy an 8-byte-wide column of h rows using two 32-bit load/store
 * pairs per row (LD32/ST32 are the project's unaligned access macros). */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = h; row > 0; row--) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
1409
/* Copy a 16-byte-wide column of h rows using four 32-bit load/store
 * pairs per row (LD32/ST32 are the project's unaligned access macros). */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = h; row > 0; row--) {
        ST32(dst     , LD32(src     ));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}
1423
/* Copy a 17-byte-wide column of h rows: four 32-bit load/store pairs
 * plus the trailing odd byte (used for qpel edge rows where one extra
 * source column is needed). */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = h; row > 0; row--) {
        ST32(dst     , LD32(src     ));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1438
/* Copy a 9-byte-wide column of h rows: two 32-bit load/store pairs plus
 * the trailing odd byte (used for qpel edge rows where one extra source
 * column is needed). */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = h; row > 0; row--) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1451
1452
1453 #define QPEL_MC(r, OPNAME, RND, OP) \
1454 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1455 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1456 int i;\
1457 for(i=0; i<h; i++)\
1458 {\
1459 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1460 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1461 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1462 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1463 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1464 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1465 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1466 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1467 dst+=dstStride;\
1468 src+=srcStride;\
1469 }\
1470 }\
1471 \
1472 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1473 const int w=8;\
1474 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1475 int i;\
1476 for(i=0; i<w; i++)\
1477 {\
1478 const int src0= src[0*srcStride];\
1479 const int src1= src[1*srcStride];\
1480 const int src2= src[2*srcStride];\
1481 const int src3= src[3*srcStride];\
1482 const int src4= src[4*srcStride];\
1483 const int src5= src[5*srcStride];\
1484 const int src6= src[6*srcStride];\
1485 const int src7= src[7*srcStride];\
1486 const int src8= src[8*srcStride];\
1487 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1488 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1489 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1490 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1491 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1492 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1493 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1494 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1495 dst++;\
1496 src++;\
1497 }\
1498 }\
1499 \
1500 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1501 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1502 int i;\
1503 \
1504 for(i=0; i<h; i++)\
1505 {\
1506 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1507 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1508 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1509 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1510 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1511 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1512 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1513 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1514 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1515 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1516 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1517 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1518 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1519 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1520 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1521 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1522 dst+=dstStride;\
1523 src+=srcStride;\
1524 }\
1525 }\
1526 \
1527 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1528 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1529 int i;\
1530 const int w=16;\
1531 for(i=0; i<w; i++)\
1532 {\
1533 const int src0= src[0*srcStride];\
1534 const int src1= src[1*srcStride];\
1535 const int src2= src[2*srcStride];\
1536 const int src3= src[3*srcStride];\
1537 const int src4= src[4*srcStride];\
1538 const int src5= src[5*srcStride];\
1539 const int src6= src[6*srcStride];\
1540 const int src7= src[7*srcStride];\
1541 const int src8= src[8*srcStride];\
1542 const int src9= src[9*srcStride];\
1543 const int src10= src[10*srcStride];\
1544 const int src11= src[11*srcStride];\
1545 const int src12= src[12*srcStride];\
1546 const int src13= src[13*srcStride];\
1547 const int src14= src[14*srcStride];\
1548 const int src15= src[15*srcStride];\
1549 const int src16= src[16*srcStride];\
1550 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1551 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1552 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1553 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1554 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1555 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1556 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1557 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1558 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1559 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1560 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1561 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1562 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1563 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1564 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1565 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1566 dst++;\
1567 src++;\
1568 }\
1569 }\
1570 \
1571 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1572 OPNAME ## pixels8_c(dst, src, stride, 8);\
1573 }\
1574 \
1575 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1576 uint8_t half[64];\
1577 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1578 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1579 }\
1580 \
1581 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1582 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1583 }\
1584 \
1585 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1586 uint8_t half[64];\
1587 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1588 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1589 }\
1590 \
1591 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1592 uint8_t full[16*9];\
1593 uint8_t half[64];\
1594 copy_block9(full, src, 16, stride, 9);\
1595 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1596 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1597 }\
1598 \
1599 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1600 uint8_t full[16*9];\
1601 copy_block9(full, src, 16, stride, 9);\
1602 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1603 }\
1604 \
1605 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1606 uint8_t full[16*9];\
1607 uint8_t half[64];\
1608 copy_block9(full, src, 16, stride, 9);\
1609 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1610 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1611 }\
1612 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1613 uint8_t full[16*9];\
1614 uint8_t halfH[72];\
1615 uint8_t halfV[64];\
1616 uint8_t halfHV[64];\
1617 copy_block9(full, src, 16, stride, 9);\
1618 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1619 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1620 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1621 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1622 }\
1623 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1624 uint8_t full[16*9];\
1625 uint8_t halfH[72];\
1626 uint8_t halfHV[64];\
1627 copy_block9(full, src, 16, stride, 9);\
1628 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1629 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1630 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1631 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1632 }\
1633 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1634 uint8_t full[16*9];\
1635 uint8_t halfH[72];\
1636 uint8_t halfV[64];\
1637 uint8_t halfHV[64];\
1638 copy_block9(full, src, 16, stride, 9);\
1639 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1640 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1641 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1642 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1643 }\
1644 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1645 uint8_t full[16*9];\
1646 uint8_t halfH[72];\
1647 uint8_t halfHV[64];\
1648 copy_block9(full, src, 16, stride, 9);\
1649 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1650 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1651 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1652 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1653 }\
1654 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1655 uint8_t full[16*9];\
1656 uint8_t halfH[72];\
1657 uint8_t halfV[64];\
1658 uint8_t halfHV[64];\
1659 copy_block9(full, src, 16, stride, 9);\
1660 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1661 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1662 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1663 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1664 }\
1665 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1666 uint8_t full[16*9];\
1667 uint8_t halfH[72];\
1668 uint8_t halfHV[64];\
1669 copy_block9(full, src, 16, stride, 9);\
1670 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1671 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1672 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1673 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1674 }\
1675 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1676 uint8_t full[16*9];\
1677 uint8_t halfH[72];\
1678 uint8_t halfV[64];\
1679 uint8_t halfHV[64];\
1680 copy_block9(full, src, 16, stride, 9);\
1681 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1682 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1683 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1684 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1685 }\
1686 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1687 uint8_t full[16*9];\
1688 uint8_t halfH[72];\
1689 uint8_t halfHV[64];\
1690 copy_block9(full, src, 16, stride, 9);\
1691 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1692 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1693 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1694 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1695 }\
1696 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1697 uint8_t halfH[72];\
1698 uint8_t halfHV[64];\
1699 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1700 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1701 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1702 }\
1703 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1704 uint8_t halfH[72];\
1705 uint8_t halfHV[64];\
1706 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1707 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1708 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1709 }\
1710 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1711 uint8_t full[16*9];\
1712 uint8_t halfH[72];\
1713 uint8_t halfV[64];\
1714 uint8_t halfHV[64];\
1715 copy_block9(full, src, 16, stride, 9);\
1716 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1717 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1718 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1719 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1720 }\
1721 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1722 uint8_t full[16*9];\
1723 uint8_t halfH[72];\
1724 copy_block9(full, src, 16, stride, 9);\
1725 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1726 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1727 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1728 }\
1729 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1730 uint8_t full[16*9];\
1731 uint8_t halfH[72];\
1732 uint8_t halfV[64];\
1733 uint8_t halfHV[64];\
1734 copy_block9(full, src, 16, stride, 9);\
1735 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1736 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1737 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1738 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1739 }\
1740 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1741 uint8_t full[16*9];\
1742 uint8_t halfH[72];\
1743 copy_block9(full, src, 16, stride, 9);\
1744 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1745 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1746 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1747 }\
1748 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1749 uint8_t halfH[72];\
1750 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1751 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1752 }\
1753 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1754 OPNAME ## pixels16_c(dst, src, stride, 16);\
1755 }\
1756 \
1757 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1758 uint8_t half[256];\
1759 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1760 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1761 }\
1762 \
1763 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1764 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1765 }\
1766 \
1767 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1768 uint8_t half[256];\
1769 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1770 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1771 }\
1772 \
1773 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1774 uint8_t full[24*17];\
1775 uint8_t half[256];\
1776 copy_block17(full, src, 24, stride, 17);\
1777 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1778 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1779 }\
1780 \
1781 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1782 uint8_t full[24*17];\
1783 copy_block17(full, src, 24, stride, 17);\
1784 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1785 }\
1786 \
1787 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1788 uint8_t full[24*17];\
1789 uint8_t half[256];\
1790 copy_block17(full, src, 24, stride, 17);\
1791 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1792 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1793 }\
1794 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1795 uint8_t full[24*17];\
1796 uint8_t halfH[272];\
1797 uint8_t halfV[256];\
1798 uint8_t halfHV[256];\
1799 copy_block17(full, src, 24, stride, 17);\
1800 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1801 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1802 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1803 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1804 }\
1805 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1806 uint8_t full[24*17];\
1807 uint8_t halfH[272];\
1808 uint8_t halfHV[256];\
1809 copy_block17(full, src, 24, stride, 17);\
1810 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1811 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1812 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1813 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1814 }\
1815 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1816 uint8_t full[24*17];\
1817 uint8_t halfH[272];\
1818 uint8_t halfV[256];\
1819 uint8_t halfHV[256];\
1820 copy_block17(full, src, 24, stride, 17);\
1821 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1822 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1823 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1824 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1825 }\
1826 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1827 uint8_t full[24*17];\
1828 uint8_t halfH[272];\
1829 uint8_t halfHV[256];\
1830 copy_block17(full, src, 24, stride, 17);\
1831 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1832 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1833 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1834 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1835 }\
1836 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1837 uint8_t full[24*17];\
1838 uint8_t halfH[272];\
1839 uint8_t halfV[256];\
1840 uint8_t halfHV[256];\
1841 copy_block17(full, src, 24, stride, 17);\
1842 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1843 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1844 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1845 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1846 }\
1847 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1848 uint8_t full[24*17];\
1849 uint8_t halfH[272];\
1850 uint8_t halfHV[256];\
1851 copy_block17(full, src, 24, stride, 17);\
1852 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1853 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1854 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1855 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1856 }\
1857 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1858 uint8_t full[24*17];\
1859 uint8_t halfH[272];\
1860 uint8_t halfV[256];\
1861 uint8_t halfHV[256];\
1862 copy_block17(full, src, 24, stride, 17);\
1863 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1864 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1865 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1866 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1867 }\
1868 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1869 uint8_t full[24*17];\
1870 uint8_t halfH[272];\
1871 uint8_t halfHV[256];\
1872 copy_block17(full, src, 24, stride, 17);\
1873 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1874 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1875 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1876 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1877 }\
1878 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1879 uint8_t halfH[272];\
1880 uint8_t halfHV[256];\
1881 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1882 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1883 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1884 }\
1885 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1886 uint8_t halfH[272];\
1887 uint8_t halfHV[256];\
1888 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1889 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1890 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1891 }\
1892 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1893 uint8_t full[24*17];\
1894 uint8_t halfH[272];\
1895 uint8_t halfV[256];\
1896 uint8_t halfHV[256];\
1897 copy_block17(full, src, 24, stride, 17);\
1898 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1899 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1900 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1901 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1902 }\
1903 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1904 uint8_t full[24*17];\
1905 uint8_t halfH[272];\
1906 copy_block17(full, src, 24, stride, 17);\
1907 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1908 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1909 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1910 }\
1911 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1912 uint8_t full[24*17];\
1913 uint8_t halfH[272];\
1914 uint8_t halfV[256];\
1915 uint8_t halfHV[256];\
1916 copy_block17(full, src, 24, stride, 17);\
1917 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1918 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1919 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1920 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1921 }\
1922 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1923 uint8_t full[24*17];\
1924 uint8_t halfH[272];\
1925 copy_block17(full, src, 24, stride, 17);\
1926 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1927 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1928 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1929 }\
1930 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1931 uint8_t halfH[272];\
1932 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1933 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1934 }
1935
/* Store/average ops plugged into QPEL_MC.  (b) is a filtered sum scaled by
 * 32, so (+16)>>5 is round-to-nearest and (+15)>>5 the "no rounding"
 * variant; cm[] (the crop table) clips the result to 0..255.  op_put stores
 * the pixel, op_avg averages it with the existing destination pixel
 * (with +1 rounding).  Instantiate the put, no-rounding-put and avg
 * function families, then drop the helper macros again. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1949
#if 1
/**
 * H264_LOWPASS(OPNAME, OP, OP2) generates the half-pel interpolation
 * primitives used by H.264 quarter-pel motion compensation:
 *   OPNAME##h264_qpel{4,8}_h_lowpass   horizontal 6-tap filter
 *   OPNAME##h264_qpel{4,8}_v_lowpass   vertical 6-tap filter
 *   OPNAME##h264_qpel{4,8}_hv_lowpass  horizontal pass into an int16_t
 *       tmp[] kept at full precision, then a vertical pass over tmp
 *   OPNAME##h264_qpel16_{h,v,hv}_lowpass  built from four 8-wide calls
 * The filter taps are (1,-5,20,20,-5,1).  OP receives a value that went
 * through one pass (scaled by 32), OP2 one that went through two passes
 * (scaled by 1024); the caller-supplied macros perform the rounding,
 * shifting, clipping via cm[] and the store/average.
 * NOTE: comments cannot go inside the macro body - a // comment before a
 * continuation backslash would swallow the next line - hence this header.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\
2152
/**
 * H264_MC(OPNAME, SIZE) generates the sixteen quarter-pel motion
 * compensation entry points OPNAME##h264_qpel##SIZE##_mcXY_c, where X/Y
 * are the horizontal/vertical quarter-pel offsets (0..3).
 *   mc00           plain copy
 *   mc20/mc02      pure horizontal/vertical half-pel (one lowpass pass)
 *   mc22           2D half-pel via the hv lowpass (int16_t tmp buffer)
 *   all others     pixels##SIZE##_l2 average of the two nearest
 *                  full/half-pel planes
 * copy_block##SIZE pads SIZE+5 rows into full[] so the 6-tap vertical
 * filter can read two rows above and three below the block.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
2289
/* Rounding ops plugged into H264_LOWPASS/H264_MC.  op_* handle a value
 * that went through ONE 6-tap pass (scaled by 32: +16, >>5); op2_* handle
 * a TWO-pass value (scaled by 1024: +512, >>10).  cm[] clips to 0..255;
 * the avg variants round-average with the existing destination pixel.
 * Instantiate the put and avg families for 4x4, 8x8 and 16x16 blocks,
 * then drop the helper macros. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]

H264_LOWPASS(put_ , op_put, op2_put)
H264_LOWPASS(avg_ , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2310
2311 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2312 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2313 int i;
2314
2315 for(i=0; i<h; i++){
2316 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2317 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2318 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2319 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2320 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2321 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2322 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2323 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2324 dst+=dstStride;
2325 src+=srcStride;
2326 }
2327 }
2328
2329 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2330 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2331 int i;
2332
2333 for(i=0; i<w; i++){
2334 const int src_1= src[ -srcStride];
2335 const int src0 = src[0 ];
2336 const int src1 = src[ srcStride];
2337 const int src2 = src[2*srcStride];
2338 const int src3 = src[3*srcStride];
2339 const int src4 = src[4*srcStride];
2340 const int src5 = src[5*srcStride];
2341 const int src6 = src[6*srcStride];
2342 const int src7 = src[7*srcStride];
2343 const int src8 = src[8*srcStride];
2344 const int src9 = src[9*srcStride];
2345 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2346 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2347 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2348 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2349 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2350 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2351 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2352 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2353 src++;
2354 dst++;
2355 }
2356 }
2357
/* mspel position (0,0): integer-pel, plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2361
/* mspel position (1/4,0): average of the source and the horizontal
 * half-pel plane. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hpel[64];

    wmv2_mspel8_h_lowpass(hpel, src, 8, stride, 8);
    put_pixels8_l2(dst, src, hpel, stride, stride, 8, 8);
}
2367
/* mspel position (1/2,0): horizontal half-pel filter straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2371
/* mspel position (3/4,0): average of the right neighbour (src+1) and the
 * horizontal half-pel plane. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hpel[64];

    wmv2_mspel8_h_lowpass(hpel, src, 8, stride, 8);
    put_pixels8_l2(dst, src + 1, hpel, stride, stride, 8, 8);
}
2377
/* mspel position (0,1/2): vertical half-pel filter straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2381
/* mspel position (1/4,1/2): average of the vertical half-pel plane and the
 * 2D (horizontal-then-vertical) half-pel plane. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hpel_h[88];
    uint8_t hpel_v[64];
    uint8_t hpel_hv[64];

    /* vertical half-pel directly from the source */
    wmv2_mspel8_v_lowpass(hpel_v, src, 8, stride, 8);
    /* horizontal half-pel over 11 rows (one above, two below the block),
     * then a vertical pass over its middle 8 rows */
    wmv2_mspel8_h_lowpass(hpel_h, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(hpel_hv, hpel_h + 8, 8, 8, 8);
    put_pixels8_l2(dst, hpel_v, hpel_hv, stride, 8, 8, 8);
}
/* mspel position (3/4,1/2): like mc12 but the vertical half-pel plane is
 * taken one pixel to the right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hpel_h[88];
    uint8_t hpel_v[64];
    uint8_t hpel_hv[64];

    wmv2_mspel8_v_lowpass(hpel_v, src + 1, 8, stride, 8);
    wmv2_mspel8_h_lowpass(hpel_h, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(hpel_hv, hpel_h + 8, 8, 8, 8);
    put_pixels8_l2(dst, hpel_v, hpel_hv, stride, 8, 8, 8);
}
/* mspel position (1/2,1/2): horizontal half-pel over 11 rows, then the
 * vertical half-pel of that, written straight into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hpel_h[88];

    wmv2_mspel8_h_lowpass(hpel_h, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hpel_h + 8, stride, 8, 8);
}
2405
/**
 * H.263 deblocking of one horizontal block edge: filters the 8 pixel
 * columns spanning the boundary, touching two lines above (p0,p1) and two
 * below (p2,p3).  Filter strength comes from a quantizer-indexed table.
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        /* gradient across the edge, dominated by the inner pair p1/p2 */
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear limiter: d1 follows d up to +/-strength, ramps
         * back to 0 by +/-2*strength, and is 0 beyond - strong edges are
         * left untouched */
        if (d<-2*strength) d1= 0;
        else if(d<- strength) d1=-2*strength - d;
        else if(d< strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else d1= 0;

        p1 += d1;
        p2 -= d1;
        /* clamp to 0..255: for values in -256..511, bit 8 set means
         * out of range; ~(p>>31) is 0 for negatives and -1 (stored as
         * 255 in the byte) for overflows */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        /* weaker secondary correction of the outer pixels, bounded by
         * half the inner correction */
        ad1= ABS(d1)>>1;

        d2= clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+ stride] = p3 + d2;
    }
}
2440
/**
 * H.263 deblocking of one vertical block edge: same filter as
 * h263_v_loop_filter_c, applied to the 8 pixel rows spanning the
 * boundary (p0,p1 left of the edge, p2,p3 right of it).
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        /* gradient across the edge, dominated by the inner pair p1/p2 */
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear limiter; see h263_v_loop_filter_c */
        if (d<-2*strength) d1= 0;
        else if(d<- strength) d1=-2*strength - d;
        else if(d< strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else d1= 0;

        p1 += d1;
        p2 -= d1;
        /* clamp to 0..255 (valid for values in -256..511) */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        /* weaker secondary correction of the outer pixels */
        ad1= ABS(d1)>>1;

        d2= clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
}
2475
/**
 * H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block,
 * applied in place.  Border rows/columns are passed through unfiltered
 * (they only pick up the normalization rounding).  The vertical pass is
 * kept at 4x scale in an intermediate buffer; the horizontal pass then
 * normalizes by 16 with rounding.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int col, row, pos, t;
    int vsum[64];   /* vertically filtered block, scaled by 4 */

    /* vertical (1,2,1) pass; top and bottom rows are just scaled */
    for(col=0; col<8; col++){
        vsum[col]        = 4*src[col];
        vsum[col + 7*8]  = 4*src[col + 7*stride];
    }
    for(row=1; row<7; row++){
        for(col=0; col<8; col++){
            pos = row*stride + col;
            vsum[row*8 + col] = src[pos - stride] + 2*src[pos] + src[pos + stride];
        }
    }

    /* horizontal (1,2,1) pass with rounding; edge columns only normalized */
    for(row=0; row<8; row++){
        src[row*stride]     = (vsum[row*8]     + 2) >> 2;
        src[row*stride + 7] = (vsum[row*8 + 7] + 2) >> 2;
        for(col=1; col<7; col++){
            t = row*8 + col;
            src[row*stride + col] = (vsum[t-1] + 2*vsum[t] + vsum[t+1] + 8) >> 4;
        }
    }
}
2502
/**
 * Sum of absolute differences (SAD) between two 16-wide pixel blocks over
 * h rows.  The first argument is an unused context pointer kept for the
 * common comparison-function signature.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2530
/**
 * SAD of a 16-wide block against the horizontally half-pel
 * interpolated reference (average of each pixel and its right neighbor).
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2558
/**
 * SAD of a 16-wide block against the vertically half-pel
 * interpolated reference (average of each pixel and the one below).
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;
    uint8_t *below = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2588
/**
 * SAD of a 16-wide block against the diagonally half-pel interpolated
 * reference (4-tap average of the 2x2 neighborhood).
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;
    uint8_t *below = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2618
/**
 * Sum of absolute differences over an 8-wide block of height h
 * (integer-pel SAD). The void* context parameter is unused.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2638
/**
 * SAD of an 8-wide block against the horizontally half-pel
 * interpolated reference.
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2658
/**
 * SAD of an 8-wide block against the vertically half-pel
 * interpolated reference.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;
    uint8_t *below = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2680
/**
 * SAD of an 8-wide block against the diagonally half-pel interpolated
 * reference (4-tap average of the 2x2 neighborhood).
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int row, col, sum = 0;
    uint8_t *below = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2702
2703 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2704 int score1=0;
2705 int score2=0;
2706 int x,y;
2707
2708 for(y=0; y<h; y++){
2709 for(x=0; x<16; x++){
2710 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2711 }
2712 if(y+1<h){
2713 for(x=0; x<15; x++){
2714 score2+= ABS( s1[x ] - s1[x +stride]
2715 - s1[x+1] + s1[x+1+stride])
2716 -ABS( s2[x ] - s2[x +stride]
2717 - s2[x+1] + s2[x+1+stride]);
2718 }
2719 }
2720 s1+= stride;
2721 s2+= stride;
2722 }
2723
2724 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2725 else return score1 + ABS(score2)*8;
2726 }
2727
2728 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2729 int score1=0;
2730 int score2=0;
2731 int x,y;
2732
2733 for(y=0; y<h; y++){
2734 for(x=0; x<8; x++){
2735 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2736 }
2737 if(y+1<h){
2738 for(x=0; x<7; x++){
2739 score2+= ABS( s1[x ] - s1[x +stride]
2740 - s1[x+1] + s1[x+1+stride])
2741 -ABS( s2[x ] - s2[x +stride]
2742 - s2[x+1] + s2[x+1+stride]);
2743 }
2744 }
2745 s1+= stride;
2746 s2+= stride;
2747 }
2748
2749 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2750 else return score1 + ABS(score2)*8;
2751 }
2752
2753 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2754 int i;
2755 unsigned int sum=0;
2756
2757 for(i=0; i<8*8; i++){
2758 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2759 int w= weight[i];
2760 b>>= RECON_SHIFT;
2761 assert(-512<b && b<512);
2762
2763 sum += (w*b)*(w*b)>>4;
2764 }
2765 return sum>>2;
2766 }
2767
2768 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2769 int i;
2770
2771 for(i=0; i<8*8; i++){
2772 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2773 }
2774 }
2775
2776 /**
2777 * permutes an 8x8 block.
2778 * @param block the block which will be permuted according to the given permutation vector
2779 * @param permutation the permutation vector
2780 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2781 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2782 * (inverse) permutated to scantable order!
2783 */
2784 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2785 {
2786 int i;
2787 DCTELEM temp[64];
2788
2789 if(last<=0) return;
2790 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2791
2792 for(i=0; i<=last; i++){
2793 const int j= scantable[i];
2794 temp[j]= block[j];
2795 block[j]=0;
2796 }
2797
2798 for(i=0; i<=last; i++){
2799 const int j= scantable[i];
2800 const int perm_j= permutation[j];
2801 block[perm_j]= temp[j];
2802 }
2803 }
2804
/* comparison function for FF_CMP_ZERO: reports zero cost for any pair of blocks */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
2808
2809 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
2810 int i;
2811
2812 memset(cmp, 0, sizeof(void*)*5);
2813
2814 for(i=0; i<5; i++){
2815 switch(type&0xFF){
2816 case FF_CMP_SAD:
2817 cmp[i]= c->sad[i];
2818 break;
2819 case FF_CMP_SATD:
2820 cmp[i]= c->hadamard8_diff[i];
2821 break;
2822 case FF_CMP_SSE:
2823 cmp[i]= c->sse[i];
2824 break;
2825 case FF_CMP_DCT:
2826 cmp[i]= c->dct_sad[i];
2827 break;
2828 case FF_CMP_PSNR:
2829 cmp[i]= c->quant_psnr[i];
2830 break;
2831 case FF_CMP_BIT:
2832 cmp[i]= c->bit[i];
2833 break;
2834 case FF_CMP_RD:
2835 cmp[i]= c->rd[i];
2836 break;
2837 case FF_CMP_VSAD:
2838 cmp[i]= c->vsad[i];
2839 break;
2840 case FF_CMP_VSSE:
2841 cmp[i]= c->vsse[i];
2842 break;
2843 case FF_CMP_ZERO:
2844 cmp[i]= zero_cmp;
2845 break;
2846 case FF_CMP_NSSE:
2847 cmp[i]= c->nsse[i];
2848 break;
2849 case FF_CMP_W53:
2850 cmp[i]= c->w53[i];
2851 break;
2852 case FF_CMP_W97:
2853 cmp[i]= c->w97[i];
2854 break;
2855 default:
2856 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
2857 }
2858 }
2859 }
2860
/**
 * Zeroes the six 64-coefficient DCT blocks of one macroblock;
 * equivalent to memset(blocks, 0, sizeof(DCTELEM)*6*64).
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
2868
/**
 * Adds src to dst byte-wise over w bytes (values wrap modulo 256
 * through the uint8_t store).
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    for (i = 0; i < w; i++)
        dst[i] = (uint8_t)(dst[i] + src[i]);
}
2884
/**
 * Stores the byte-wise difference src1 - src2 into dst over w bytes
 * (values wrap modulo 256 through the uint8_t store).
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    for (i = 0; i < w; i++)
        dst[i] = (uint8_t)(src1[i] - src2[i]);
}
2900
/**
 * HuffYUV median-prediction residual: for each position, predicts the
 * current sample from the left sample, the top sample (src1) and the
 * gradient left+top-topleft (mod 256), taking the median of the three
 * via mid_pred, and stores src2[i] - prediction in dst.
 * *left / *left_top carry the running state across calls and are
 * updated on return.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t topleft = *left_top;
    uint8_t prev    = *left;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(prev, src1[i], (prev + src1[i] - topleft) & 0xFF);

        topleft = src1[i];   /* becomes top-left for the next column */
        prev    = src2[i];   /* becomes left for the next column */
        dst[i]  = prev - pred;
    }

    *left     = prev;
    *left_top = topleft;
}
2918
/* butterfly with separate outputs: o1 = i1+i2, o2 = i1-i2 */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* in-place butterfly: (x, y) <- (x+y, x-y) */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* last butterfly stage folded into the absolute-value accumulation */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2933
/**
 * SATD of one 8x8 block: applies an 8x8 Hadamard transform to the
 * difference src - dst and returns the sum of absolute transformed
 * coefficients. Only h==8 is supported (asserted).
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: three butterfly stages over each row of the
       pixel differences, written into temp[] row by row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: first two stages in place, the last stage folded
       into the absolute-sum accumulation via BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
2985
2986 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2987 int i;
2988 int temp[64];
2989 int sum=0;
2990
2991 assert(h==8);
2992
2993 for(i=0; i<8; i++){
2994 //FIXME try pointer walks
2995 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2996 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2997 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2998 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2999
3000 BUTTERFLY1(temp