/* libavcodec/dsputil.c (libav.git) -- gitweb export page header kept as a
 * comment so the file remains valid C. */
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21 */
22
23 /**
24 * @file dsputil.c
25 * DSP utils
26 */
27
28 #include "avcodec.h"
29 #include "dsputil.h"
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
32 #include "faandct.h"
33
/* Clamping table: cropTbl[MAX_NEG_CROP + x] clamps x into 0..255.  Declared
 * zeroed here; filled at runtime elsewhere (presumably in dsputil_init --
 * not visible in this chunk). */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Square table used via (squareTbl + 256)[v] for v in -255..255, i.e. it is
 * presumably v*v; filled at runtime like cropTbl -- confirm in the init code. */
uint32_t squareTbl[512] = {0, };
36
/* Classic zigzag scan order: maps scan index -> raster position inside an
 * 8x8 coefficient block. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
47
/* Specific zigzag scan for 2-4-8 (field) IDCT.  NOTE that unlike the
 * specification, we interleave the fields here. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
60
/* Not-permutated inverse of zigzag_direct, entries offset by +1, for the MMX
 * quantizer.  Zeroed here; filled at runtime (presumably during init). */
uint16_t __align8 inv_zigzag_direct16[64] = {0, };
63
/* "Alternate" horizontal scan order (name suggests the MPEG alternate scan
 * used for interlaced material -- confirm against the spec before relying
 * on that). */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
74
/* "Alternate" vertical scan order, the column-oriented counterpart of the
 * horizontal table above. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
85
/* Fixed-point reciprocal table used to replace division by a small constant
 * with a multiply and shift:
 *   a*inverse[b] >> 32 == a/b   for all 0<=a<=65536 && 2<=b<=255
 * (entries 0 and 1 are placeholders; index is the divisor b). */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
121
/* Input coefficient permutation required by simple_idct_mmx: maps natural
 * coefficient positions to the order that IDCT implementation expects. */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
133
/**
 * Sum all 256 pixel values of a 16x16 block.
 *
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return the sum of the 16x16 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int row, col;
    int total = 0;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
155
156 static int pix_norm1_c(uint8_t * pix, int line_size)
157 {
158 int s, i, j;
159 uint32_t *sq = squareTbl + 256;
160
161 s = 0;
162 for (i = 0; i < 16; i++) {
163 for (j = 0; j < 16; j += 8) {
164 #if 0
165 s += sq[pix[0]];
166 s += sq[pix[1]];
167 s += sq[pix[2]];
168 s += sq[pix[3]];
169 s += sq[pix[4]];
170 s += sq[pix[5]];
171 s += sq[pix[6]];
172 s += sq[pix[7]];
173 #else
174 #if LONG_MAX > 2147483647
175 register uint64_t x=*(uint64_t*)pix;
176 s += sq[x&0xff];
177 s += sq[(x>>8)&0xff];
178 s += sq[(x>>16)&0xff];
179 s += sq[(x>>24)&0xff];
180 s += sq[(x>>32)&0xff];
181 s += sq[(x>>40)&0xff];
182 s += sq[(x>>48)&0xff];
183 s += sq[(x>>56)&0xff];
184 #else
185 register uint32_t x=*(uint32_t*)pix;
186 s += sq[x&0xff];
187 s += sq[(x>>8)&0xff];
188 s += sq[(x>>16)&0xff];
189 s += sq[(x>>24)&0xff];
190 x=*(uint32_t*)(pix+4);
191 s += sq[x&0xff];
192 s += sq[(x>>8)&0xff];
193 s += sq[(x>>16)&0xff];
194 s += sq[(x>>24)&0xff];
195 #endif
196 #endif
197 pix += 8;
198 }
199 pix += line_size - 16;
200 }
201 return s;
202 }
203
/**
 * Byte-swap an array of 32-bit words from src into dst.
 *
 * @param w number of 32-bit words to swap
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i = 0;

    /* bulk: eight words per iteration */
    while (i + 8 <= w) {
        int k;
        for (k = 0; k < 8; k++)
            dst[i + k] = bswap_32(src[i + k]);
        i += 8;
    }
    /* tail: remaining w % 8 words */
    while (i < w) {
        dst[i] = bswap_32(src[i]);
        i++;
    }
}
221
222 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
223 {
224 int s, i;
225 uint32_t *sq = squareTbl + 256;
226
227 s = 0;
228 for (i = 0; i < h; i++) {
229 s += sq[pix1[0] - pix2[0]];
230 s += sq[pix1[1] - pix2[1]];
231 s += sq[pix1[2] - pix2[2]];
232 s += sq[pix1[3] - pix2[3]];
233 pix1 += line_size;
234 pix2 += line_size;
235 }
236 return s;
237 }
238
239 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
240 {
241 int s, i;
242 uint32_t *sq = squareTbl + 256;
243
244 s = 0;
245 for (i = 0; i < h; i++) {
246 s += sq[pix1[0] - pix2[0]];
247 s += sq[pix1[1] - pix2[1]];
248 s += sq[pix1[2] - pix2[2]];
249 s += sq[pix1[3] - pix2[3]];
250 s += sq[pix1[4] - pix2[4]];
251 s += sq[pix1[5] - pix2[5]];
252 s += sq[pix1[6] - pix2[6]];
253 s += sq[pix1[7] - pix2[7]];
254 pix1 += line_size;
255 pix2 += line_size;
256 }
257 return s;
258 }
259
260 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
261 {
262 int s, i;
263 uint32_t *sq = squareTbl + 256;
264
265 s = 0;
266 for (i = 0; i < h; i++) {
267 s += sq[pix1[ 0] - pix2[ 0]];
268 s += sq[pix1[ 1] - pix2[ 1]];
269 s += sq[pix1[ 2] - pix2[ 2]];
270 s += sq[pix1[ 3] - pix2[ 3]];
271 s += sq[pix1[ 4] - pix2[ 4]];
272 s += sq[pix1[ 5] - pix2[ 5]];
273 s += sq[pix1[ 6] - pix2[ 6]];
274 s += sq[pix1[ 7] - pix2[ 7]];
275 s += sq[pix1[ 8] - pix2[ 8]];
276 s += sq[pix1[ 9] - pix2[ 9]];
277 s += sq[pix1[10] - pix2[10]];
278 s += sq[pix1[11] - pix2[11]];
279 s += sq[pix1[12] - pix2[12]];
280 s += sq[pix1[13] - pix2[13]];
281 s += sq[pix1[14] - pix2[14]];
282 s += sq[pix1[15] - pix2[15]];
283
284 pix1 += line_size;
285 pix2 += line_size;
286 }
287 return s;
288 }
289
290
/**
 * Wavelet-domain sum of absolute differences between two pixel blocks.
 *
 * The pixel differences are scaled by 16, transformed in place with
 * ff_spatial_dwt() (type selects the wavelet; dec_count the number of
 * decomposition levels), and the absolute transformed coefficients are
 * summed.  Used as a motion-estimation comparison metric.
 *
 * Fix: the difference was previously scaled with `<< 4`; left-shifting a
 * negative value is undefined behaviour in C, so the scaling now uses a
 * multiply, which yields the same values with defined semantics.
 *
 * NOTE(review): tmp[] holds 16x16 coefficients addressed as tmp[16*i+j],
 * so this assumes w <= 16 and h <= 16 -- confirm against all callers.
 *
 * @param v         unused context pointer (compare-function signature)
 * @param w         block width, 8 or 16 (selects 3 or 4 DWT levels)
 * @param h         block height in rows
 * @param type      wavelet type forwarded to ff_spatial_dwt()
 * @return sum of absolute wavelet coefficients, scaled down by >> 2
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[16*16];
#if 0
    /* per-subband weighting tables, currently disabled */
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            //8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            //16x16 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {//FIXME 5/3
            //8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            //16x16 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };
#endif

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            /* *16 instead of <<4: shifting a negative difference left is UB */
            tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])*16;
            tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])*16;
            tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])*16;
            tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])*16;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    ff_spatial_dwt(tmp, w, h, 16, type, dec_count);

    s=0;
#if 0
    /* disabled per-subband weighted accumulation, kept for reference */
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int sx= (ori&1) ? 1<<level: 0;
            int stride= 16<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;
            int size= 1<<level;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += ABS(v);
                }
            }
        }
    }
#endif
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            s+= ABS(tmp[16*i+j+0]);
            s+= ABS(tmp[16*i+j+1]);
            s+= ABS(tmp[16*i+j+2]);
            s+= ABS(tmp[16*i+j+3]);
        }
    }
    assert(s>=0);

    return s>>2;
}
370
/* 8-wide wavelet compare, type=1 (presumably the 5/3 wavelet, per the name). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}
374
/* 8-wide wavelet compare, type=0 (presumably the 9/7 wavelet, per the name). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}
378
/* 16-wide wavelet compare, type=1 (presumably the 5/3 wavelet, per the name). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
382
/* 16-wide wavelet compare, type=0 (presumably the 9/7 wavelet, per the name). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
386
387 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
388 {
389 int i;
390
391 /* read the pixels */
392 for(i=0;i<8;i++) {
393 block[0] = pixels[0];
394 block[1] = pixels[1];
395 block[2] = pixels[2];
396 block[3] = pixels[3];
397 block[4] = pixels[4];
398 block[5] = pixels[5];
399 block[6] = pixels[6];
400 block[7] = pixels[7];
401 pixels += line_size;
402 block += 8;
403 }
404 }
405
406 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
407 const uint8_t *s2, int stride){
408 int i;
409
410 /* read the pixels */
411 for(i=0;i<8;i++) {
412 block[0] = s1[0] - s2[0];
413 block[1] = s1[1] - s2[1];
414 block[2] = s1[2] - s2[2];
415 block[3] = s1[3] - s2[3];
416 block[4] = s1[4] - s2[4];
417 block[5] = s1[5] - s2[5];
418 block[6] = s1[6] - s2[6];
419 block[7] = s1[7] - s2[7];
420 s1 += stride;
421 s2 += stride;
422 block += 8;
423 }
424 }
425
426
427 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
428 int line_size)
429 {
430 int i;
431 uint8_t *cm = cropTbl + MAX_NEG_CROP;
432
433 /* read the pixels */
434 for(i=0;i<8;i++) {
435 pixels[0] = cm[block[0]];
436 pixels[1] = cm[block[1]];
437 pixels[2] = cm[block[2]];
438 pixels[3] = cm[block[3]];
439 pixels[4] = cm[block[4]];
440 pixels[5] = cm[block[5]];
441 pixels[6] = cm[block[6]];
442 pixels[7] = cm[block[7]];
443
444 pixels += line_size;
445 block += 8;
446 }
447 }
448
449 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
450 int line_size)
451 {
452 int i;
453 uint8_t *cm = cropTbl + MAX_NEG_CROP;
454
455 /* read the pixels */
456 for(i=0;i<4;i++) {
457 pixels[0] = cm[block[0]];
458 pixels[1] = cm[block[1]];
459 pixels[2] = cm[block[2]];
460 pixels[3] = cm[block[3]];
461
462 pixels += line_size;
463 block += 8;
464 }
465 }
466
467 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
468 int line_size)
469 {
470 int i;
471 uint8_t *cm = cropTbl + MAX_NEG_CROP;
472
473 /* read the pixels */
474 for(i=0;i<2;i++) {
475 pixels[0] = cm[block[0]];
476 pixels[1] = cm[block[1]];
477
478 pixels += line_size;
479 block += 8;
480 }
481 }
482
483 static void put_signed_pixels_clamped_c(const DCTELEM *block,
484 uint8_t *restrict pixels,
485 int line_size)
486 {
487 int i, j;
488
489 for (i = 0; i < 8; i++) {
490 for (j = 0; j < 8; j++) {
491 if (*block < -128)
492 *pixels = 0;
493 else if (*block > 127)
494 *pixels = 255;
495 else
496 *pixels = (uint8_t)(*block + 128);
497 block++;
498 pixels++;
499 }
500 pixels += (line_size - 8);
501 }
502 }
503
504 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
505 int line_size)
506 {
507 int i;
508 uint8_t *cm = cropTbl + MAX_NEG_CROP;
509
510 /* read the pixels */
511 for(i=0;i<8;i++) {
512 pixels[0] = cm[pixels[0] + block[0]];
513 pixels[1] = cm[pixels[1] + block[1]];
514 pixels[2] = cm[pixels[2] + block[2]];
515 pixels[3] = cm[pixels[3] + block[3]];
516 pixels[4] = cm[pixels[4] + block[4]];
517 pixels[5] = cm[pixels[5] + block[5]];
518 pixels[6] = cm[pixels[6] + block[6]];
519 pixels[7] = cm[pixels[7] + block[7]];
520 pixels += line_size;
521 block += 8;
522 }
523 }
524
525 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
526 int line_size)
527 {
528 int i;
529 uint8_t *cm = cropTbl + MAX_NEG_CROP;
530
531 /* read the pixels */
532 for(i=0;i<4;i++) {
533 pixels[0] = cm[pixels[0] + block[0]];
534 pixels[1] = cm[pixels[1] + block[1]];
535 pixels[2] = cm[pixels[2] + block[2]];
536 pixels[3] = cm[pixels[3] + block[3]];
537 pixels += line_size;
538 block += 8;
539 }
540 }
541
542 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
543 int line_size)
544 {
545 int i;
546 uint8_t *cm = cropTbl + MAX_NEG_CROP;
547
548 /* read the pixels */
549 for(i=0;i<2;i++) {
550 pixels[0] = cm[pixels[0] + block[0]];
551 pixels[1] = cm[pixels[1] + block[1]];
552 pixels += line_size;
553 block += 8;
554 }
555 }
#if 0
/* NOTE(review): dead 64-bit PIXOP2 variant, permanently disabled by this
 * #if 0; the 32-bit variant after the #else below is the one that is
 * compiled.  Also note the first helper here is named OPNAME ## _pixels
 * while the CALL_2X_PIXELS lines at the bottom reference
 * OPNAME ## _pixels_c, so this branch would not compile as-is. */

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
698 #else // 64 bit variant
699
700 #define PIXOP2(OPNAME, OP) \
701 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
702 int i;\
703 for(i=0; i<h; i++){\
704 OP(*((uint16_t*)(block )), LD16(pixels ));\
705 pixels+=line_size;\
706 block +=line_size;\
707 }\
708 }\
709 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
710 int i;\
711 for(i=0; i<h; i++){\
712 OP(*((uint32_t*)(block )), LD32(pixels ));\
713 pixels+=line_size;\
714 block +=line_size;\
715 }\
716 }\
717 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
718 int i;\
719 for(i=0; i<h; i++){\
720 OP(*((uint32_t*)(block )), LD32(pixels ));\
721 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
722 pixels+=line_size;\
723 block +=line_size;\
724 }\
725 }\
726 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
727 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
728 }\
729 \
730 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
731 int src_stride1, int src_stride2, int h){\
732 int i;\
733 for(i=0; i<h; i++){\
734 uint32_t a,b;\
735 a= LD32(&src1[i*src_stride1 ]);\
736 b= LD32(&src2[i*src_stride2 ]);\
737 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
738 a= LD32(&src1[i*src_stride1+4]);\
739 b= LD32(&src2[i*src_stride2+4]);\
740 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
741 }\
742 }\
743 \
744 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
745 int src_stride1, int src_stride2, int h){\
746 int i;\
747 for(i=0; i<h; i++){\
748 uint32_t a,b;\
749 a= LD32(&src1[i*src_stride1 ]);\
750 b= LD32(&src2[i*src_stride2 ]);\
751 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
752 a= LD32(&src1[i*src_stride1+4]);\
753 b= LD32(&src2[i*src_stride2+4]);\
754 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
755 }\
756 }\
757 \
758 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
759 int src_stride1, int src_stride2, int h){\
760 int i;\
761 for(i=0; i<h; i++){\
762 uint32_t a,b;\
763 a= LD32(&src1[i*src_stride1 ]);\
764 b= LD32(&src2[i*src_stride2 ]);\
765 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
766 }\
767 }\
768 \
769 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
770 int src_stride1, int src_stride2, int h){\
771 int i;\
772 for(i=0; i<h; i++){\
773 uint32_t a,b;\
774 a= LD16(&src1[i*src_stride1 ]);\
775 b= LD16(&src2[i*src_stride2 ]);\
776 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
777 }\
778 }\
779 \
780 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
781 int src_stride1, int src_stride2, int h){\
782 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
783 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
784 }\
785 \
786 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
787 int src_stride1, int src_stride2, int h){\
788 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
789 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
790 }\
791 \
792 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
793 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
794 }\
795 \
796 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
797 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
798 }\
799 \
800 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
801 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
802 }\
803 \
804 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
805 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
806 }\
807 \
808 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
809 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
810 int i;\
811 for(i=0; i<h; i++){\
812 uint32_t a, b, c, d, l0, l1, h0, h1;\
813 a= LD32(&src1[i*src_stride1]);\
814 b= LD32(&src2[i*src_stride2]);\
815 c= LD32(&src3[i*src_stride3]);\
816 d= LD32(&src4[i*src_stride4]);\
817 l0= (a&0x03030303UL)\
818 + (b&0x03030303UL)\
819 + 0x02020202UL;\
820 h0= ((a&0xFCFCFCFCUL)>>2)\
821 + ((b&0xFCFCFCFCUL)>>2);\
822 l1= (c&0x03030303UL)\
823 + (d&0x03030303UL);\
824 h1= ((c&0xFCFCFCFCUL)>>2)\
825 + ((d&0xFCFCFCFCUL)>>2);\
826 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
827 a= LD32(&src1[i*src_stride1+4]);\
828 b= LD32(&src2[i*src_stride2+4]);\
829 c= LD32(&src3[i*src_stride3+4]);\
830 d= LD32(&src4[i*src_stride4+4]);\
831 l0= (a&0x03030303UL)\
832 + (b&0x03030303UL)\
833 + 0x02020202UL;\
834 h0= ((a&0xFCFCFCFCUL)>>2)\
835 + ((b&0xFCFCFCFCUL)>>2);\
836 l1= (c&0x03030303UL)\
837 + (d&0x03030303UL);\
838 h1= ((c&0xFCFCFCFCUL)>>2)\
839 + ((d&0xFCFCFCFCUL)>>2);\
840 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
841 }\
842 }\
843 \
844 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
845 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
846 }\
847 \
848 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
849 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
850 }\
851 \
852 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
853 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
854 }\
855 \
856 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
857 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
858 }\
859 \
860 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
861 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
862 int i;\
863 for(i=0; i<h; i++){\
864 uint32_t a, b, c, d, l0, l1, h0, h1;\
865 a= LD32(&src1[i*src_stride1]);\
866 b= LD32(&src2[i*src_stride2]);\
867 c= LD32(&src3[i*src_stride3]);\
868 d= LD32(&src4[i*src_stride4]);\
869 l0= (a&0x03030303UL)\
870 + (b&0x03030303UL)\
871 + 0x01010101UL;\
872 h0= ((a&0xFCFCFCFCUL)>>2)\
873 + ((b&0xFCFCFCFCUL)>>2);\
874 l1= (c&0x03030303UL)\
875 + (d&0x03030303UL);\
876 h1= ((c&0xFCFCFCFCUL)>>2)\
877 + ((d&0xFCFCFCFCUL)>>2);\
878 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
879 a= LD32(&src1[i*src_stride1+4]);\
880 b= LD32(&src2[i*src_stride2+4]);\
881 c= LD32(&src3[i*src_stride3+4]);\
882 d= LD32(&src4[i*src_stride4+4]);\
883 l0= (a&0x03030303UL)\
884 + (b&0x03030303UL)\
885 + 0x01010101UL;\
886 h0= ((a&0xFCFCFCFCUL)>>2)\
887 + ((b&0xFCFCFCFCUL)>>2);\
888 l1= (c&0x03030303UL)\
889 + (d&0x03030303UL);\
890 h1= ((c&0xFCFCFCFCUL)>>2)\
891 + ((d&0xFCFCFCFCUL)>>2);\
892 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
893 }\
894 }\
895 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
896 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
897 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
898 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
899 }\
900 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
901 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
902 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
903 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
904 }\
905 \
906 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
907 {\
908 int i, a0, b0, a1, b1;\
909 a0= pixels[0];\
910 b0= pixels[1] + 2;\
911 a0 += b0;\
912 b0 += pixels[2];\
913 \
914 pixels+=line_size;\
915 for(i=0; i<h; i+=2){\
916 a1= pixels[0];\
917 b1= pixels[1];\
918 a1 += b1;\
919 b1 += pixels[2];\
920 \
921 block[0]= (a1+a0)>>2; /* FIXME non put */\
922 block[1]= (b1+b0)>>2;\
923 \
924 pixels+=line_size;\
925 block +=line_size;\
926 \
927 a0= pixels[0];\
928 b0= pixels[1] + 2;\
929 a0 += b0;\
930 b0 += pixels[2];\
931 \
932 block[0]= (a1+a0)>>2;\
933 block[1]= (b1+b0)>>2;\
934 pixels+=line_size;\
935 block +=line_size;\
936 }\
937 }\
938 \
939 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
940 {\
941 int i;\
942 const uint32_t a= LD32(pixels );\
943 const uint32_t b= LD32(pixels+1);\
944 uint32_t l0= (a&0x03030303UL)\
945 + (b&0x03030303UL)\
946 + 0x02020202UL;\
947 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
948 + ((b&0xFCFCFCFCUL)>>2);\
949 uint32_t l1,h1;\
950 \
951 pixels+=line_size;\
952 for(i=0; i<h; i+=2){\
953 uint32_t a= LD32(pixels );\
954 uint32_t b= LD32(pixels+1);\
955 l1= (a&0x03030303UL)\
956 + (b&0x03030303UL);\
957 h1= ((a&0xFCFCFCFCUL)>>2)\
958 + ((b&0xFCFCFCFCUL)>>2);\
959 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
960 pixels+=line_size;\
961 block +=line_size;\
962 a= LD32(pixels );\
963 b= LD32(pixels+1);\
964 l0= (a&0x03030303UL)\
965 + (b&0x03030303UL)\
966 + 0x02020202UL;\
967 h0= ((a&0xFCFCFCFCUL)>>2)\
968 + ((b&0xFCFCFCFCUL)>>2);\
969 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
970 pixels+=line_size;\
971 block +=line_size;\
972 }\
973 }\
974 \
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        /* 8-wide half-pel (x+1/2, y+1/2) interpolation: the outer j loop    \
           runs the 4-wide kernel twice, once for each 4-byte column.        \
           Per byte: out = (p00 + p10 + p01 + p11 + 2) >> 2, computed with   \
           the split low-2-bit / high-6-bit lane trick (see pixels4_xy2). */\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* rewind to the top row (the prologue consumed one extra source     \
           row, hence h+1) and step 4 bytes right for the second column */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
1015 \
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        /* Same 8-wide (x+1/2, y+1/2) kernel as pixels8_xy2 above, but with  \
           rounding bias 0x01 per byte instead of 0x02, i.e.                 \
           out = (p00 + p10 + p01 + p11 + 1) >> 2 — the "no rounding"       \
           prediction variant. */\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* back to the top row, over to the right 4-byte column */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
1056 \
/* 16-pixel-wide variants are built from two adjacent 8-wide calls. */\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
/* full-pel copy has no rounding step, so no_rnd_pixels16 reuses pixels8 */\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1065
#define op_avg(a, b) a = rnd_avg32(a, b) /* average with existing dst, rounding up */
#endif
#define op_put(a, b) a = b               /* plain store */

/* Instantiate the whole put and avg pixel-op family from the PIXOP2 template. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* Scalar 2- and 4-tap averages with rounding, used by the qpel code below. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1077
/**
 * No-rounding average of two 16-wide sources into dst, all sharing one
 * stride.  Thin adapter over the three-stride put_no_rnd_pixels16_l2().
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1081
/**
 * No-rounding average of two 8-wide sources into dst, all sharing one
 * stride.  Thin adapter over the three-stride put_no_rnd_pixels8_l2().
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1085
/**
 * Single-vector global motion compensation over an 8-pixel-wide block.
 * (x16, y16) is the 1/16-pel fractional offset in [0,16); the four
 * bilinear taps A..D always sum to 256, so the weighted sum plus the
 * caller-supplied rounder is scaled back with >>8.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1] + rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
1108
/**
 * Affine global motion compensation for one 8-pixel-wide, h-row block.
 * (ox,oy) is the source position of the first sample with (16+shift)
 * fractional bits; (dxx,dyx) is the per-pixel step along a row and
 * (dxy,dyy) the per-row step.  r is the rounding constant for the
 * bilinear blend (scaled by 1<<(shift*2)); width/height bound the valid
 * source area, with out-of-range taps clipped to the nearest edge sample.
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* turn width/height into the last valid coordinate so that the
       unsigned compares below test "strictly inside" in one branch */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* drop the 16 sub-fraction bits, then split the remaining
               position into a 1/s fraction and an integer coordinate */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* (unsigned)v < bound rejects negatives and overflow at once */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2x2 bilinear blend */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                      + (  src[index+stride   ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                      + r)>>(shift*2);
                }else{
                    /* off the top/bottom: clip y, interpolate in x only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                      + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* off the left/right: clip x, interpolate in y only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                      + r)>>(shift*2);
                }else{
                    /* off a corner: nearest edge sample, no filtering */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1166
/* Full-pel thirdpel position: plain block copy, dispatched on block width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1175
/* Horizontal 1/3-pel: out = round((2*left + right)/3); 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;
    for (j = height; j--; src += stride, dst += stride) {
        for (i = 0; i < width; i++)
            dst[i] = (683*(2*src[i] + src[i+1] + 1)) >> 11;
    }
}
1186
/* Horizontal 2/3-pel: out = round((left + 2*right)/3); 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;
    for (j = height; j--; src += stride, dst += stride) {
        for (i = 0; i < width; i++)
            dst[i] = (683*(src[i] + 2*src[i+1] + 1)) >> 11;
    }
}
1197
/* Vertical 1/3-pel: out = round((2*top + bottom)/3); 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;
    for (j = height; j--; src += stride, dst += stride) {
        for (i = 0; i < width; i++)
            dst[i] = (683*(2*src[i] + src[i+stride] + 1)) >> 11;
    }
}
1208
/* (1/3, 1/3) thirdpel: weights 4,3,3,2 over the 2x2 neighbourhood,
 * bias 6, scaled by 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;
    for (j = height; j--; src += stride, dst += stride) {
        for (i = 0; i < width; i++)
            dst[i] = (2731*(4*src[i] + 3*src[i+1] + 3*src[i+stride] + 2*src[i+stride+1] + 6)) >> 15;
    }
}
1219
/* (1/3, 2/3) thirdpel: weights 3,2,4,3 over the 2x2 neighbourhood,
 * bias 6, scaled by 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;
    for (j = height; j--; src += stride, dst += stride) {
        for (i = 0; i < width; i++)
            dst[i] = (2731*(3*src[i] + 2*src[i+1] + 4*src[i+stride] + 3*src[i+stride+1] + 6)) >> 15;
    }
}
1230
/* Vertical 2/3-pel: out = round((top + 2*bottom)/3); 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;
    for (j = height; j--; src += stride, dst += stride) {
        for (i = 0; i < width; i++)
            dst[i] = (683*(src[i] + 2*src[i+stride] + 1)) >> 11;
    }
}
1241
/* (2/3, 1/3) thirdpel: weights 3,4,2,3 over the 2x2 neighbourhood,
 * bias 6, scaled by 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;
    for (j = height; j--; src += stride, dst += stride) {
        for (i = 0; i < width; i++)
            dst[i] = (2731*(3*src[i] + 4*src[i+1] + 2*src[i+stride] + 3*src[i+stride+1] + 6)) >> 15;
    }
}
1252
/* (2/3, 2/3) thirdpel: weights 2,3,3,4 over the 2x2 neighbourhood,
 * bias 6, scaled by 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;
    for (j = height; j--; src += stride, dst += stride) {
        for (i = 0; i < width; i++)
            dst[i] = (2731*(2*src[i] + 3*src[i+1] + 3*src[i+stride] + 4*src[i+stride+1] + 6)) >> 15;
    }
}
1263
/* Full-pel thirdpel position with averaging into dst, dispatched on width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1272
/* Horizontal 1/3-pel, rounded-averaged with the existing dst pixel. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;
    for (j = height; j--; src += stride, dst += stride) {
        for (i = 0; i < width; i++)
            dst[i] = (dst[i] + ((683*(2*src[i] + src[i+1] + 1)) >> 11) + 1) >> 1;
    }
}
1283
/* Horizontal 2/3-pel, rounded-averaged with the existing dst pixel. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;
    for (j = height; j--; src += stride, dst += stride) {
        for (i = 0; i < width; i++)
            dst[i] = (dst[i] + ((683*(src[i] + 2*src[i+1] + 1)) >> 11) + 1) >> 1;
    }
}
1294
/* Vertical 1/3-pel, rounded-averaged with the existing dst pixel. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;
    for (j = height; j--; src += stride, dst += stride) {
        for (i = 0; i < width; i++)
            dst[i] = (dst[i] + ((683*(2*src[i] + src[i+stride] + 1)) >> 11) + 1) >> 1;
    }
}
1305
/* (1/3, 1/3) thirdpel (weights 4,3,3,2, ~1/12 scale), averaged into dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;
    for (j = height; j--; src += stride, dst += stride) {
        for (i = 0; i < width; i++)
            dst[i] = (dst[i] + ((2731*(4*src[i] + 3*src[i+1] + 3*src[i+stride] + 2*src[i+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}
1316
/* (1/3, 2/3) thirdpel (weights 3,2,4,3, ~1/12 scale), averaged into dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;
    for (j = height; j--; src += stride, dst += stride) {
        for (i = 0; i < width; i++)
            dst[i] = (dst[i] + ((2731*(3*src[i] + 2*src[i+1] + 4*src[i+stride] + 3*src[i+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}
1327
/* Vertical 2/3-pel, rounded-averaged with the existing dst pixel. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;
    for (j = height; j--; src += stride, dst += stride) {
        for (i = 0; i < width; i++)
            dst[i] = (dst[i] + ((683*(src[i] + 2*src[i+stride] + 1)) >> 11) + 1) >> 1;
    }
}
1338
/* (2/3, 1/3) thirdpel (weights 3,4,2,3, ~1/12 scale), averaged into dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;
    for (j = height; j--; src += stride, dst += stride) {
        for (i = 0; i < width; i++)
            dst[i] = (dst[i] + ((2731*(3*src[i] + 4*src[i+1] + 2*src[i+stride] + 3*src[i+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}
1349
/* (2/3, 2/3) thirdpel (weights 2,3,3,4, ~1/12 scale), averaged into dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;
    for (j = height; j--; src += stride, dst += stride) {
        for (i = 0; i < width; i++)
            dst[i] = (dst[i] + ((2731*(2*src[i] + 3*src[i+1] + 3*src[i+stride] + 4*src[i+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}
#if 0
/* Disabled draft of fixed-width tpel wrappers; note the stray "void" in
 * front of each call — this block is not valid C as written and is kept
 * only as a reference. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1381
/* Template for H.264 chroma motion compensation: 1/8-pel bilinear
 * interpolation over a 2-, 4- or 8-pixel-wide block.  The taps A..D always
 * sum to 64; OP supplies the final rounding and store (op_put) or rounding
 * plus averaging with the existing dst (op_avg), see below. */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    /* x and y are the 1/8-pel fractional offsets */\
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1444
/* The bilinear sum above is scaled by 64, so (sum+32)>>6 restores pixel
 * range; op_avg additionally rounded-averages with the existing dst value. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1452
/* Copy a 4-byte-wide block of h rows, one unaligned 32-bit load/store per row. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h--) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1463
/* Copy an 8-byte-wide block of h rows, two unaligned 32-bit transfers per row. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h--) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
1475
/* Copy a 16-byte-wide block of h rows, four unaligned 32-bit transfers per row. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h--) {
        ST32(dst,      LD32(src));
        ST32(dst + 4,  LD32(src + 4));
        ST32(dst + 8,  LD32(src + 8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}
1489
/* Copy a 17-byte-wide block of h rows (16+1 for qpel edge handling):
 * four 32-bit transfers plus one trailing byte per row. */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h--) {
        ST32(dst,      LD32(src));
        ST32(dst + 4,  LD32(src + 4));
        ST32(dst + 8,  LD32(src + 8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1504
/* Copy a 9-byte-wide block of h rows (8+1 for qpel edge handling):
 * two 32-bit transfers plus one trailing byte per row. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h--) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1517
1518
1519 #define QPEL_MC(r, OPNAME, RND, OP) \
1520 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1521 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1522 int i;\
1523 for(i=0; i<h; i++)\
1524 {\
1525 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1526 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1527 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1528 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1529 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1530 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1531 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1532 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1533 dst+=dstStride;\
1534 src+=srcStride;\
1535 }\
1536 }\
1537 \
1538 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1539 const int w=8;\
1540 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1541 int i;\
1542 for(i=0; i<w; i++)\
1543 {\
1544 const int src0= src[0*srcStride];\
1545 const int src1= src[1*srcStride];\
1546 const int src2= src[2*srcStride];\
1547 const int src3= src[3*srcStride];\
1548 const int src4= src[4*srcStride];\
1549 const int src5= src[5*srcStride];\
1550 const int src6= src[6*srcStride];\
1551 const int src7= src[7*srcStride];\
1552 const int src8= src[8*srcStride];\
1553 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1554 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1555 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1556 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1557 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1558 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1559 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1560 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1561 dst++;\
1562 src++;\
1563 }\
1564 }\
1565 \
1566 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1567 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1568 int i;\
1569 \
1570 for(i=0; i<h; i++)\
1571 {\
1572 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1573 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1574 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1575 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1576 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1577 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1578 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1579 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1580 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1581 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1582 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1583 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1584 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1585 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1586 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1587 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1588 dst+=dstStride;\
1589 src+=srcStride;\
1590 }\
1591 }\
1592 \
1593 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1594 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1595 int i;\
1596 const int w=16;\
1597 for(i=0; i<w; i++)\
1598 {\
1599 const int src0= src[0*srcStride];\
1600 const int src1= src[1*srcStride];\
1601 const int src2= src[2*srcStride];\
1602 const int src3= src[3*srcStride];\
1603 const int src4= src[4*srcStride];\
1604 const int src5= src[5*srcStride];\
1605 const int src6= src[6*srcStride];\
1606 const int src7= src[7*srcStride];\
1607 const int src8= src[8*srcStride];\
1608 const int src9= src[9*srcStride];\
1609 const int src10= src[10*srcStride];\
1610 const int src11= src[11*srcStride];\
1611 const int src12= src[12*srcStride];\
1612 const int src13= src[13*srcStride];\
1613 const int src14= src[14*srcStride];\
1614 const int src15= src[15*srcStride];\
1615 const int src16= src[16*srcStride];\
1616 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1617 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1618 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1619 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1620 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1621 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1622 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1623 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1624 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1625 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1626 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1627 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1628 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1629 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1630 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1631 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1632 dst++;\
1633 src++;\
1634 }\
1635 }\
1636 \
1637 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1638 OPNAME ## pixels8_c(dst, src, stride, 8);\
1639 }\
1640 \
1641 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1642 uint8_t half[64];\
1643 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1644 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1645 }\
1646 \
1647 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1648 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1649 }\
1650 \
1651 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1652 uint8_t half[64];\
1653 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1654 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1655 }\
1656 \
1657 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1658 uint8_t full[16*9];\
1659 uint8_t half[64];\
1660 copy_block9(full, src, 16, stride, 9);\
1661 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1662 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1663 }\
1664 \
1665 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1666 uint8_t full[16*9];\
1667 copy_block9(full, src, 16, stride, 9);\
1668 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1669 }\
1670 \
1671 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1672 uint8_t full[16*9];\
1673 uint8_t half[64];\
1674 copy_block9(full, src, 16, stride, 9);\
1675 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1676 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1677 }\
1678 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1679 uint8_t full[16*9];\
1680 uint8_t halfH[72];\
1681 uint8_t halfV[64];\
1682 uint8_t halfHV[64];\
1683 copy_block9(full, src, 16, stride, 9);\
1684 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1685 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1686 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1687 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1688 }\
1689 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1690 uint8_t full[16*9];\
1691 uint8_t halfH[72];\
1692 uint8_t halfHV[64];\
1693 copy_block9(full, src, 16, stride, 9);\
1694 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1695 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1696 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1697 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1698 }\
1699 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1700 uint8_t full[16*9];\
1701 uint8_t halfH[72];\
1702 uint8_t halfV[64];\
1703 uint8_t halfHV[64];\
1704 copy_block9(full, src, 16, stride, 9);\
1705 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1706 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1707 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1708 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1709 }\
1710 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1711 uint8_t full[16*9];\
1712 uint8_t halfH[72];\
1713 uint8_t halfHV[64];\
1714 copy_block9(full, src, 16, stride, 9);\
1715 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1716 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1717 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1718 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1719 }\
1720 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1721 uint8_t full[16*9];\
1722 uint8_t halfH[72];\
1723 uint8_t halfV[64];\
1724 uint8_t halfHV[64];\
1725 copy_block9(full, src, 16, stride, 9);\
1726 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1727 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1728 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1729 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1730 }\
1731 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1732 uint8_t full[16*9];\
1733 uint8_t halfH[72];\
1734 uint8_t halfHV[64];\
1735 copy_block9(full, src, 16, stride, 9);\
1736 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1737 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1738 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1739 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1740 }\
1741 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1742 uint8_t full[16*9];\
1743 uint8_t halfH[72];\
1744 uint8_t halfV[64];\
1745 uint8_t halfHV[64];\
1746 copy_block9(full, src, 16, stride, 9);\
1747 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1748 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1749 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1750 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1751 }\
1752 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1753 uint8_t full[16*9];\
1754 uint8_t halfH[72];\
1755 uint8_t halfHV[64];\
1756 copy_block9(full, src, 16, stride, 9);\
1757 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1758 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1759 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1760 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1761 }\
1762 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1763 uint8_t halfH[72];\
1764 uint8_t halfHV[64];\
1765 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1766 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1767 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1768 }\
1769 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1770 uint8_t halfH[72];\
1771 uint8_t halfHV[64];\
1772 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1773 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1774 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1775 }\
1776 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1777 uint8_t full[16*9];\
1778 uint8_t halfH[72];\
1779 uint8_t halfV[64];\
1780 uint8_t halfHV[64];\
1781 copy_block9(full, src, 16, stride, 9);\
1782 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1783 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1784 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1785 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1786 }\
1787 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1788 uint8_t full[16*9];\
1789 uint8_t halfH[72];\
1790 copy_block9(full, src, 16, stride, 9);\
1791 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1792 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1793 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1794 }\
1795 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1796 uint8_t full[16*9];\
1797 uint8_t halfH[72];\
1798 uint8_t halfV[64];\
1799 uint8_t halfHV[64];\
1800 copy_block9(full, src, 16, stride, 9);\
1801 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1802 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1803 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1804 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1805 }\
1806 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1807 uint8_t full[16*9];\
1808 uint8_t halfH[72];\
1809 copy_block9(full, src, 16, stride, 9);\
1810 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1811 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1812 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1813 }\
1814 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1815 uint8_t halfH[72];\
1816 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1817 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1818 }\
1819 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1820 OPNAME ## pixels16_c(dst, src, stride, 16);\
1821 }\
1822 \
1823 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1824 uint8_t half[256];\
1825 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1826 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1827 }\
1828 \
1829 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1830 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1831 }\
1832 \
1833 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1834 uint8_t half[256];\
1835 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1836 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1837 }\
1838 \
1839 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1840 uint8_t full[24*17];\
1841 uint8_t half[256];\
1842 copy_block17(full, src, 24, stride, 17);\
1843 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1844 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1845 }\
1846 \
1847 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1848 uint8_t full[24*17];\
1849 copy_block17(full, src, 24, stride, 17);\
1850 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1851 }\
1852 \
1853 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1854 uint8_t full[24*17];\
1855 uint8_t half[256];\
1856 copy_block17(full, src, 24, stride, 17);\
1857 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1858 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1859 }\
1860 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1861 uint8_t full[24*17];\
1862 uint8_t halfH[272];\
1863 uint8_t halfV[256];\
1864 uint8_t halfHV[256];\
1865 copy_block17(full, src, 24, stride, 17);\
1866 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1867 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1868 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1869 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1870 }\
1871 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1872 uint8_t full[24*17];\
1873 uint8_t halfH[272];\
1874 uint8_t halfHV[256];\
1875 copy_block17(full, src, 24, stride, 17);\
1876 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1877 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1878 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1879 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1880 }\
1881 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1882 uint8_t full[24*17];\
1883 uint8_t halfH[272];\
1884 uint8_t halfV[256];\
1885 uint8_t halfHV[256];\
1886 copy_block17(full, src, 24, stride, 17);\
1887 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1888 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1889 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1890 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1891 }\
1892 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1893 uint8_t full[24*17];\
1894 uint8_t halfH[272];\
1895 uint8_t halfHV[256];\
1896 copy_block17(full, src, 24, stride, 17);\
1897 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1898 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1899 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1900 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1901 }\
1902 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1903 uint8_t full[24*17];\
1904 uint8_t halfH[272];\
1905 uint8_t halfV[256];\
1906 uint8_t halfHV[256];\
1907 copy_block17(full, src, 24, stride, 17);\
1908 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1909 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1910 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1911 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1912 }\
1913 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1914 uint8_t full[24*17];\
1915 uint8_t halfH[272];\
1916 uint8_t halfHV[256];\
1917 copy_block17(full, src, 24, stride, 17);\
1918 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1919 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1920 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1921 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1922 }\
1923 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1924 uint8_t full[24*17];\
1925 uint8_t halfH[272];\
1926 uint8_t halfV[256];\
1927 uint8_t halfHV[256];\
1928 copy_block17(full, src, 24, stride, 17);\
1929 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1930 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1931 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1932 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1933 }\
1934 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1935 uint8_t full[24*17];\
1936 uint8_t halfH[272];\
1937 uint8_t halfHV[256];\
1938 copy_block17(full, src, 24, stride, 17);\
1939 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1940 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1941 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1942 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1943 }\
1944 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1945 uint8_t halfH[272];\
1946 uint8_t halfHV[256];\
1947 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1948 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1949 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1950 }\
1951 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1952 uint8_t halfH[272];\
1953 uint8_t halfHV[256];\
1954 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1955 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1956 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1957 }\
1958 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1959 uint8_t full[24*17];\
1960 uint8_t halfH[272];\
1961 uint8_t halfV[256];\
1962 uint8_t halfHV[256];\
1963 copy_block17(full, src, 24, stride, 17);\
1964 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1965 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1966 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1967 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1968 }\
1969 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1970 uint8_t full[24*17];\
1971 uint8_t halfH[272];\
1972 copy_block17(full, src, 24, stride, 17);\
1973 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1974 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1975 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1976 }\
1977 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1978 uint8_t full[24*17];\
1979 uint8_t halfH[272];\
1980 uint8_t halfV[256];\
1981 uint8_t halfHV[256];\
1982 copy_block17(full, src, 24, stride, 17);\
1983 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1984 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1985 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1986 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1987 }\
1988 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1989 uint8_t full[24*17];\
1990 uint8_t halfH[272];\
1991 copy_block17(full, src, 24, stride, 17);\
1992 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1993 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1994 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1995 }\
1996 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1997 uint8_t halfH[272];\
1998 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1999 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2000 }
2001
/* Pixel-store operators handed to QPEL_MC.  The filter output `b` is a
 * 6-tap sum scaled by 32, so "(b + 16) >> 5" rounds to nearest while the
 * no_rnd variants use "+ 15" to round down (the no-rounding prediction
 * mode).  `cm` is the clip-to-0..255 table in scope at the expansion site. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the three needed MPEG-4 qpel MC families; avg_no_rnd is
 * intentionally left disabled as nothing uses it. */
QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
2015
#if 1
/* H.264 luma interpolation: the standard 6-tap (1,-5,20,20,-5,1) half-pel
 * lowpass filter, generated for 4x4, 8x8 and (by composition) 16x16 blocks.
 *
 * OPNAME prefixes the generated function names (put_/avg_).
 * OP     stores one horizontally- or vertically-filtered sample; the raw
 *        sum is scaled by 32, so OP is expected to apply "(+16)>>5".
 * OP2    stores one sample filtered in both directions via the int16_t
 *        intermediate `tmp`; the raw sum is scaled by 1024, so OP2 is
 *        expected to apply "(+512)>>10".
 *
 * The *_v_ and *_hv_ variants read 2 rows above and 3 rows below the block
 * (src[-2*srcStride] .. src[(size+2)*srcStride]); callers must provide
 * that margin. */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/* H.264 quarter-pel motion compensation dispatchers.
 *
 * Generates one function per quarter-pel position mcXY (X = horizontal
 * quarter offset 0..3, Y = vertical quarter offset 0..3) for a SIZE x SIZE
 * block.  Half-pel positions come straight from the lowpass filters;
 * quarter-pel positions are the average (pixels*_l2) of the two nearest
 * half/full-pel planes, as specified by H.264.
 *
 * `full` holds a copy of the source with the 2-above / 3-below margin the
 * vertical filter needs; `full_mid` points at the first real row inside it. */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

/* Store operators for the H.264 lowpass filters.  Single-pass filter sums
 * are scaled by 32 (round with +16 >> 5); the two-pass hv sums are scaled
 * by 1024 (round with +512 >> 10), hence the separate op2_* variants. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]

/* Instantiate put/avg filter kernels, then the per-position MC functions
 * for each H.264 block size. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2376
/**
 * Clamp a value to the 8-bit pixel range [0, 255].
 */
static inline uint8_t clip1(int x){
    return x < 0 ? 0 : (x > 255 ? 255 : x);
}
/* H.264 weighted prediction (C reference).
 *
 * op_scale1: unidirectional weighting of one sample in place,
 *            block[x] = clip1((block[x]*weight + offset) >> log2_denom).
 * op_scale2: bidirectional weighting, blending src into dst,
 *            dst[x] = clip((src[x]*weights + dst[x]*weightd + offset)
 *                          >> (log2_denom+1), 0, 255).
 */
#define op_scale1(x)  block[x] = clip1( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1), 0, 255 )
/* Generates weight_/biweight_h264_pixels WxH _c for one block size.
 * The unused `x` of the original declaration is dropped: the op_scale
 * macros take a literal column index, so only `y` is needed. */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    /* pre-scale the offset and fold in round-to-nearest for the shift */ \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
    int y; \
    /* average the two offsets, then fold the rounding term into it: \
     * ((o<<1)+1)<<log2_denom == (o << (log2_denom+1)) + (1 << log2_denom) */ \
    int offset = (offsets + offsetd + 1) >> 1; \
    offset = ((offset << 1) + 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}
2437
/* Instantiate (bi)weight functions for every H.264 partition size
 * down to 2x2 chroma blocks. */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2452
2453 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2454 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2455 int i;
2456
2457 for(i=0; i<h; i++){
2458 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2459 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2460 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2461 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2462 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2463 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2464 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2465 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2466 dst+=dstStride;
2467 src+=srcStride;
2468 }
2469 }
2470
2471 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2472 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2473 int i;
2474
2475 for(i=0; i<w; i++){
2476 const int src_1= src[ -srcStride];
2477 const int src0 = src[0 ];
2478 const int src1 = src[ srcStride];
2479 const int src2 = src[2*srcStride];
2480 const int src3 = src[3*srcStride];
2481 const int src4 = src[4*srcStride];
2482 const int src5 = src[5*srcStride];
2483 const int src6 = src[6*srcStride];
2484 const int src7 = src[7*srcStride];
2485 const int src8 = src[8*srcStride];
2486 const int src9 = src[9*srcStride];
2487 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2488 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2489 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2490 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2491 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2492 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2493 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2494 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2495 src++;
2496 dst++;
2497 }
2498 }
2499
/* mspel position (0,0): integer-pel, plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2503
/* mspel position (1,0): average of the source and its horizontal
 * half-pel filtering (cf. the mcXY naming used by the qpel functions). */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];  /* 8x8 h-filtered intermediate */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2509
/* mspel position (2,0): horizontal half-pel, filtered directly into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2513
/* mspel position (3,0): average of the pixel to the right (src+1) and the
 * horizontal half-pel filtering. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];  /* 8x8 h-filtered intermediate */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2519
/* mspel position (0,2): vertical half-pel, filtered directly into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2523
/* mspel position (1,2): average of the vertical half-pel plane and the
 * H-then-V (centre) plane. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];   /* 8x11: h-filtered, starting one row above src */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    /* halfH+8 skips the extra top row so the v-filter is centred on src */
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* WMV2 mspel 8x8 MC at sub-pel position (3,2): like mc12 but the vertical
 * filter of the raw source is taken one pixel to the right. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t tmpH[88];    /* 8 columns x 11 rows of horizontally filtered data */
    uint8_t tmpV[64];
    uint8_t tmpHV[64];

    wmv2_mspel8_h_lowpass(tmpH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(tmpV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(tmpHV, tmpH+8, 8, 8, 8);
    put_pixels8_l2(dst, tmpV, tmpHV, stride, 8, 8, 8);
}
/* WMV2 mspel 8x8 MC at sub-pel position (2,2): horizontal lowpass into a
 * temporary 8x11 buffer, then vertical lowpass straight into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t tmpH[88];

    wmv2_mspel8_h_lowpass(tmpH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, tmpH+8, stride, 8, 8);
}
2547
/**
 * H.263 loop (deblocking) filter across a horizontal block edge.
 * For each of the 8 columns it filters the 4 pixels straddling the edge
 * (p0,p1 above — p2,p3 below).
 * @param src pointer to the first line below the edge
 * @param stride line size in bytes
 * @param qscale quantizer; indexes the strength table
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* nonlinear ramp: full correction for |d| < strength, then fading
           back to zero at |d| = 2*strength, so strong real edges survive */
        if (d<-2*strength) d1= 0;
        else if(d<- strength) d1=-2*strength - d;
        else if(d< strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else d1= 0;

        p1 += d1;
        p2 -= d1;
        /* cheap clip to 0..255: bit 8 set means out of range, the sign of
           the shifted value then selects 0 or 255 */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= ABS(d1)>>1;

        /* secondary correction of the outer pixels, limited to half the
           magnitude of the primary step */
        d2= clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+ stride] = p3 + d2;
    }
}
2582
/**
 * H.263 loop (deblocking) filter across a vertical block edge.
 * Same algorithm as h263_v_loop_filter_c but applied horizontally: for each
 * of the 8 lines it filters the 4 pixels straddling the edge
 * (p0,p1 left — p2,p3 right).
 * @param src pointer to the first column right of the edge
 * @param stride line size in bytes
 * @param qscale quantizer; indexes the strength table
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* nonlinear ramp: full correction for |d| < strength, fading back
           to zero at |d| = 2*strength */
        if (d<-2*strength) d1= 0;
        else if(d<- strength) d1=-2*strength - d;
        else if(d< strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else d1= 0;

        p1 += d1;
        p2 -= d1;
        /* cheap clip to 0..255 (see h263_v_loop_filter_c) */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= ABS(d1)>>1;

        /* secondary correction of the outer pixels, limited to half the
           magnitude of the primary step */
        d2= clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
}
2617
/**
 * H.261 in-loop filter: separable [1 2 1]/4 smoothing of an 8x8 block.
 * Border rows/columns are passed through unfiltered (kept at 4x scale in the
 * intermediate so the final rounding matches).
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int row, col;
    int vfilt[64];   /* vertically filtered block, scaled by 4 */

    /* vertical pass: top and bottom rows copied (times 4), inner rows get
       the [1 2 1] tap */
    for(col=0; col<8; col++){
        vfilt[col      ] = 4*src[col           ];
        vfilt[col + 7*8] = 4*src[col + 7*stride];
    }
    for(row=1; row<7; row++){
        for(col=0; col<8; col++){
            const int s = row*stride + col;
            vfilt[row*8 + col] = src[s - stride] + 2*src[s] + src[s + stride];
        }
    }

    /* horizontal pass with rounding; edge columns are only renormalized */
    for(row=0; row<8; row++){
        const int *line = vfilt + row*8;
        uint8_t   *out  = src   + row*stride;

        out[0] = (line[0] + 2)>>2;
        out[7] = (line[7] + 2)>>2;
        for(col=1; col<7; col++)
            out[col] = (line[col-1] + 2*line[col] + line[col+1] + 8)>>4;
    }
}
2644
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 * @param v unused context pointer (me_cmp_func signature)
 * @param h number of lines to compare
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - pix2[j]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2672
/**
 * SAD of a 16-wide block against the reference interpolated at the
 * horizontal half-pel position (average of each pixel and its right
 * neighbour; reads one pixel past column 15).
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - avg2(pix2[j], pix2[j + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2700
/**
 * SAD of a 16-wide block against the reference interpolated at the
 * vertical half-pel position (average of each pixel and the one below).
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - avg2(pix2[j], below[j]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2730
/**
 * SAD of a 16-wide block against the reference interpolated at the
 * diagonal half-pel position (average of the 2x2 neighbourhood; reads one
 * pixel past column 15 and one line past line h-1).
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - avg4(pix2[j], pix2[j + 1], below[j], below[j + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2760
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 * @param v unused context pointer (me_cmp_func signature)
 * @param h number of lines to compare
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            sum += abs(pix1[j] - pix2[j]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2780
/**
 * SAD of an 8-wide block against the reference interpolated at the
 * horizontal half-pel position (reads one pixel past column 7).
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            sum += abs(pix1[j] - avg2(pix2[j], pix2[j + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2800
/**
 * SAD of an 8-wide block against the reference interpolated at the
 * vertical half-pel position (average of each pixel and the one below).
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            sum += abs(pix1[j] - avg2(pix2[j], below[j]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2822
/**
 * SAD of an 8-wide block against the reference interpolated at the
 * diagonal half-pel position (2x2 average; reads one pixel past column 7
 * and one line past line h-1).
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            sum += abs(pix1[j] - avg4(pix2[j], pix2[j + 1], below[j], below[j + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2844
2845 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2846 int score1=0;
2847 int score2=0;
2848 int x,y;
2849
2850 for(y=0; y<h; y++){
2851 for(x=0; x<16; x++){
2852 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2853 }
2854 if(y+1<h){
2855 for(x=0; x<15; x++){
2856 score2+= ABS( s1[x ] - s1[x +stride]
2857 - s1[x+1] + s1[x+1+stride])
2858 -ABS( s2[x ] - s2[x +stride]
2859 - s2[x+1] + s2[x+1+stride]);
2860 }
2861 }
2862 s1+= stride;
2863 s2+= stride;
2864 }
2865
2866 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2867 else return score1 + ABS(score2)*8;
2868 }
2869
2870 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2871 int score1=0;
2872 int score2=0;
2873 int x,y;
2874
2875 for(y=0; y<h; y++){
2876 for(x=0; x<8; x++){
2877 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2878 }
2879 if(y+1<h){
2880 for(x=0; x<7; x++){
2881 score2+= ABS( s1[x ] - s1[x +stride]
2882 - s1[x+1] + s1[x+1+stride])
2883 -ABS( s2[x ] - s2[x +stride]
2884 - s2[x+1] + s2[x+1+stride]);
2885 }
2886 }
2887 s1+= stride;
2888 s2+= stride;
2889 }
2890
2891 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2892 else return score1 + ABS(score2)*8;
2893 }
2894
2895 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2896 int i;
2897 unsigned int sum=0;
2898
2899 for(i=0; i<8*8; i++){
2900 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2901 int w= weight[i];
2902 b>>= RECON_SHIFT;
2903 assert(-512<b && b<512);
2904
2905 sum += (w*b)*(w*b)>>4;
2906 }
2907 return sum>>2;
2908 }
2909
2910 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2911 int i;
2912
2913 for(i=0; i<8*8; i++){
2914 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2915 }
2916 }
2917
2918 /**
2919 * permutes an 8x8 block.
2920 * @param block the block which will be permuted according to the given permutation vector
2921 * @param permutation the permutation vector
2922 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2923 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2924 * (inverse) permutated to scantable order!
2925 */
2926 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2927 {
2928 int i;
2929 DCTELEM temp[64];
2930
2931 if(last<=0) return;
2932 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2933
2934 for(i=0; i<=last; i++){
2935 const int j= scantable[i];
2936 temp[j]= block[j];
2937 block[j]=0;
2938 }
2939
2940 for(i=0; i<=last; i++){
2941 const int j= scantable[i];
2942 const int perm_j= permutation[j];
2943 block[perm_j]= temp[j];
2944 }
2945 }
2946
/* dummy compare function for FF_CMP_ZERO: always reports a perfect match */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
2950
/**
 * Fills the 5 comparison-function slots of cmp[] with the functions of the
 * requested type taken from the DSPContext.
 * @param cmp array of 5 slots; cleared first, left NULL on unknown type
 * @param type one of the FF_CMP_* constants (only the low byte is used)
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
3005
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 * Clears the 6 coefficient blocks (64 coefficients each) of a macroblock.
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
3013
/* dst[i] += src[i] for 0 <= i < w (byte-wise, wraps modulo 256) */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i = 0, j;

    /* main loop handles 8 bytes per iteration, tail loop the remainder */
    for (; i + 7 < w; i += 8)
        for (j = 0; j < 8; j++)
            dst[i + j] += src[i + j];
    for (; i < w; i++)
        dst[i] += src[i];
}
3029
3030 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3031 int i;
3032 for(i=0; i+7<w; i+=8){
3033 dst[i+0] = src1[i+0]-src2[i+0];
3034 dst[i+1] = src1[i+1]-src2[i+1];
3035 dst[i+2] = src1[i+2]-src2[i+2];
3036 dst[i+3] = src1[i+3]-src2[i+3];
3037 dst[i+4] = src1[i+4]-src2[i+4];
3038 dst[i+5] = src1[i+5]-src2[i+5];
3039 dst[i+6] = src1[i+6]-src2[i+6];
3040 dst[i+7] = src1[i+7]-src2[i+7];
3041 }