/* Extracted from libav.git / libavcodec / dsputil.c (gitweb page residue:
   "1/2 resolution decoding" was navigation text, not part of this file). */
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21 */
22
23 /**
24 * @file dsputil.c
25 * DSP utils
26 */
27
#include <string.h>

#include "avcodec.h"
#include "dsputil.h"
#include "mpegvideo.h"
#include "simple_idct.h"
#include "faandct.h"
33
/* Clipping lookup table, filled in at init time elsewhere:
   cropTbl[MAX_NEG_CROP + x] clamps x to the 0..255 range. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };

/* Square lookup table, filled in at init time elsewhere:
   (squareTbl + 256)[x] == x*x for -256 <= x < 256. */
uint32_t squareTbl[512] = {0, };

/* Standard JPEG/MPEG zigzag scan order for an 8x8 block. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permutated inverse zigzag_direct + 1 for MMX quantizer;
   filled in at init time elsewhere. */
uint16_t __align8 inv_zigzag_direct16[64] = {0, };

/* Alternate horizontal scan order (interlaced material). */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

/* Alternate vertical scan order (interlaced material). */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
   (fixed-point reciprocals used to avoid integer division). */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,  16976156,  16909321,  16843010,
};

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
133
/**
 * Sum all 256 pixel values of a 16x16 block.
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
155
156 static int pix_norm1_c(uint8_t * pix, int line_size)
157 {
158 int s, i, j;
159 uint32_t *sq = squareTbl + 256;
160
161 s = 0;
162 for (i = 0; i < 16; i++) {
163 for (j = 0; j < 16; j += 8) {
164 #if 0
165 s += sq[pix[0]];
166 s += sq[pix[1]];
167 s += sq[pix[2]];
168 s += sq[pix[3]];
169 s += sq[pix[4]];
170 s += sq[pix[5]];
171 s += sq[pix[6]];
172 s += sq[pix[7]];
173 #else
174 #if LONG_MAX > 2147483647
175 register uint64_t x=*(uint64_t*)pix;
176 s += sq[x&0xff];
177 s += sq[(x>>8)&0xff];
178 s += sq[(x>>16)&0xff];
179 s += sq[(x>>24)&0xff];
180 s += sq[(x>>32)&0xff];
181 s += sq[(x>>40)&0xff];
182 s += sq[(x>>48)&0xff];
183 s += sq[(x>>56)&0xff];
184 #else
185 register uint32_t x=*(uint32_t*)pix;
186 s += sq[x&0xff];
187 s += sq[(x>>8)&0xff];
188 s += sq[(x>>16)&0xff];
189 s += sq[(x>>24)&0xff];
190 x=*(uint32_t*)(pix+4);
191 s += sq[x&0xff];
192 s += sq[(x>>8)&0xff];
193 s += sq[(x>>16)&0xff];
194 s += sq[(x>>24)&0xff];
195 #endif
196 #endif
197 pix += 8;
198 }
199 pix += line_size - 16;
200 }
201 return s;
202 }
203
/**
 * Byte-swap a buffer of 32-bit words (endianness conversion).
 * @param dst destination buffer (may alias src)
 * @param src source buffer
 * @param w   number of 32-bit words
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int n = 0;

    /* main loop, unrolled by 8 words */
    while (n + 8 <= w) {
        dst[n    ] = bswap_32(src[n    ]);
        dst[n + 1] = bswap_32(src[n + 1]);
        dst[n + 2] = bswap_32(src[n + 2]);
        dst[n + 3] = bswap_32(src[n + 3]);
        dst[n + 4] = bswap_32(src[n + 4]);
        dst[n + 5] = bswap_32(src[n + 5]);
        dst[n + 6] = bswap_32(src[n + 6]);
        dst[n + 7] = bswap_32(src[n + 7]);
        n += 8;
    }
    /* remaining 0..7 words */
    while (n < w) {
        dst[n] = bswap_32(src[n]);
        n++;
    }
}
221
222 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
223 {
224 int s, i;
225 uint32_t *sq = squareTbl + 256;
226
227 s = 0;
228 for (i = 0; i < h; i++) {
229 s += sq[pix1[0] - pix2[0]];
230 s += sq[pix1[1] - pix2[1]];
231 s += sq[pix1[2] - pix2[2]];
232 s += sq[pix1[3] - pix2[3]];
233 pix1 += line_size;
234 pix2 += line_size;
235 }
236 return s;
237 }
238
239 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
240 {
241 int s, i;
242 uint32_t *sq = squareTbl + 256;
243
244 s = 0;
245 for (i = 0; i < h; i++) {
246 s += sq[pix1[0] - pix2[0]];
247 s += sq[pix1[1] - pix2[1]];
248 s += sq[pix1[2] - pix2[2]];
249 s += sq[pix1[3] - pix2[3]];
250 s += sq[pix1[4] - pix2[4]];
251 s += sq[pix1[5] - pix2[5]];
252 s += sq[pix1[6] - pix2[6]];
253 s += sq[pix1[7] - pix2[7]];
254 pix1 += line_size;
255 pix2 += line_size;
256 }
257 return s;
258 }
259
260 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
261 {
262 int s, i;
263 uint32_t *sq = squareTbl + 256;
264
265 s = 0;
266 for (i = 0; i < h; i++) {
267 s += sq[pix1[ 0] - pix2[ 0]];
268 s += sq[pix1[ 1] - pix2[ 1]];
269 s += sq[pix1[ 2] - pix2[ 2]];
270 s += sq[pix1[ 3] - pix2[ 3]];
271 s += sq[pix1[ 4] - pix2[ 4]];
272 s += sq[pix1[ 5] - pix2[ 5]];
273 s += sq[pix1[ 6] - pix2[ 6]];
274 s += sq[pix1[ 7] - pix2[ 7]];
275 s += sq[pix1[ 8] - pix2[ 8]];
276 s += sq[pix1[ 9] - pix2[ 9]];
277 s += sq[pix1[10] - pix2[10]];
278 s += sq[pix1[11] - pix2[11]];
279 s += sq[pix1[12] - pix2[12]];
280 s += sq[pix1[13] - pix2[13]];
281 s += sq[pix1[14] - pix2[14]];
282 s += sq[pix1[15] - pix2[15]];
283
284 pix1 += line_size;
285 pix2 += line_size;
286 }
287 return s;
288 }
289
290
/**
 * Wavelet-domain distortion metric: transforms the pix1-pix2 difference
 * with a spatial DWT and returns the scaled sum of absolute coefficients.
 * @param v          unused context pointer (DSPContext callback convention)
 * @param pix1,pix2  blocks to compare
 * @param line_size  byte stride between rows
 * @param w          block width, must be 8 or 16 (selects 3 vs 4
 *                   decomposition levels)
 * @param h          number of rows
 * @param type       wavelet type forwarded to ff_spatial_dwt()
 *                   (NOTE(review): presumably 0 = 9/7, 1 = 5/3, judging
 *                   from the disabled scale tables below — confirm against
 *                   the dwt implementation)
 * @return sum of |coefficient| over the transform, >>2
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[16*16];
#if 0
    /* disabled: per-subband psychovisual weighting tables,
       indexed as scale[type][dec_count-3][level][orientation] */
    int level, ori;
    static const int scale[2][2][4][4]={
        {
            {
                //8x8 dec=3
                {268, 239, 239, 213},
                {  0, 224, 224, 152},
                {  0, 135, 135, 110},
            },{
                //16x16 dec=4
                {344, 310, 310, 280},
                {  0, 320, 320, 228},
                {  0, 175, 175, 136},
                {  0, 129, 129, 102},
            }
        },{
            {//FIXME 5/3
                //8x8 dec=3
                {275, 245, 245, 218},
                {  0, 230, 230, 156},
                {  0, 138, 138, 113},
            },{
                //16x16 dec=4
                {352, 317, 317, 286},
                {  0, 328, 328, 233},
                {  0, 180, 180, 140},
                {  0, 132, 132, 105},
            }
        }
    };
#endif

    /* build the difference block, pre-scaled by 16 for DWT precision */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    /* in-place wavelet transform, stride 16 (row length of tmp) */
    ff_spatial_dwt(tmp, w, h, 16, type, dec_count);

    s=0;
#if 0
    /* disabled: per-subband weighted accumulation using scale[] above */
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int sx= (ori&1) ? 1<<level: 0;
            int stride= 16<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;
            int size= 1<<level;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += ABS(v);
                }
            }
        }
    }
#endif
    /* unweighted sum of absolute transform coefficients */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            s+= ABS(tmp[16*i+j+0]);
            s+= ABS(tmp[16*i+j+1]);
            s+= ABS(tmp[16*i+j+2]);
            s+= ABS(tmp[16*i+j+3]);
        }
    }
    assert(s>=0);

    return s>>2;
}
370
/* 5/3 wavelet distortion of an 8-wide block (type 1) */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}

/* 9/7 wavelet distortion of an 8-wide block (type 0) */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}

/* 5/3 wavelet distortion of a 16-wide block (type 1) */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

/* 9/7 wavelet distortion of a 16-wide block (type 0) */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
386
387 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
388 {
389 int i;
390
391 /* read the pixels */
392 for(i=0;i<8;i++) {
393 block[0] = pixels[0];
394 block[1] = pixels[1];
395 block[2] = pixels[2];
396 block[3] = pixels[3];
397 block[4] = pixels[4];
398 block[5] = pixels[5];
399 block[6] = pixels[6];
400 block[7] = pixels[7];
401 pixels += line_size;
402 block += 8;
403 }
404 }
405
406 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
407 const uint8_t *s2, int stride){
408 int i;
409
410 /* read the pixels */
411 for(i=0;i<8;i++) {
412 block[0] = s1[0] - s2[0];
413 block[1] = s1[1] - s2[1];
414 block[2] = s1[2] - s2[2];
415 block[3] = s1[3] - s2[3];
416 block[4] = s1[4] - s2[4];
417 block[5] = s1[5] - s2[5];
418 block[6] = s1[6] - s2[6];
419 block[7] = s1[7] - s2[7];
420 s1 += stride;
421 s2 += stride;
422 block += 8;
423 }
424 }
425
426
427 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
428 int line_size)
429 {
430 int i;
431 uint8_t *cm = cropTbl + MAX_NEG_CROP;
432
433 /* read the pixels */
434 for(i=0;i<8;i++) {
435 pixels[0] = cm[block[0]];
436 pixels[1] = cm[block[1]];
437 pixels[2] = cm[block[2]];
438 pixels[3] = cm[block[3]];
439 pixels[4] = cm[block[4]];
440 pixels[5] = cm[block[5]];
441 pixels[6] = cm[block[6]];
442 pixels[7] = cm[block[7]];
443
444 pixels += line_size;
445 block += 8;
446 }
447 }
448
449 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
450 int line_size)
451 {
452 int i;
453 uint8_t *cm = cropTbl + MAX_NEG_CROP;
454
455 /* read the pixels */
456 for(i=0;i<4;i++) {
457 pixels[0] = cm[block[0]];
458 pixels[1] = cm[block[1]];
459 pixels[2] = cm[block[2]];
460 pixels[3] = cm[block[3]];
461
462 pixels += line_size;
463 block += 8;
464 }
465 }
466
467 static void put_signed_pixels_clamped_c(const DCTELEM *block,
468 uint8_t *restrict pixels,
469 int line_size)
470 {
471 int i, j;
472
473 for (i = 0; i < 8; i++) {
474 for (j = 0; j < 8; j++) {
475 if (*block < -128)
476 *pixels = 0;
477 else if (*block > 127)
478 *pixels = 255;
479 else
480 *pixels = (uint8_t)(*block + 128);
481 block++;
482 pixels++;
483 }
484 pixels += (line_size - 8);
485 }
486 }
487
488 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
489 int line_size)
490 {
491 int i;
492 uint8_t *cm = cropTbl + MAX_NEG_CROP;
493
494 /* read the pixels */
495 for(i=0;i<8;i++) {
496 pixels[0] = cm[pixels[0] + block[0]];
497 pixels[1] = cm[pixels[1] + block[1]];
498 pixels[2] = cm[pixels[2] + block[2]];
499 pixels[3] = cm[pixels[3] + block[3]];
500 pixels[4] = cm[pixels[4] + block[4]];
501 pixels[5] = cm[pixels[5] + block[5]];
502 pixels[6] = cm[pixels[6] + block[6]];
503 pixels[7] = cm[pixels[7] + block[7]];
504 pixels += line_size;
505 block += 8;
506 }
507 }
508
509 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
510 int line_size)
511 {
512 int i;
513 uint8_t *cm = cropTbl + MAX_NEG_CROP;
514
515 /* read the pixels */
516 for(i=0;i<4;i++) {
517 pixels[0] = cm[pixels[0] + block[0]];
518 pixels[1] = cm[pixels[1] + block[1]];
519 pixels[2] = cm[pixels[2] + block[2]];
520 pixels[3] = cm[pixels[3] + block[3]];
521 pixels += line_size;
522 block += 8;
523 }
524 }
525 #if 0
526
/* Disabled (#if 0) 64-bit variant of the PIXOP2 pixel-copy/average macro
 * family: works on 8 pixels at a time through uint64_t loads (LD64) and
 * SWAR bit tricks. Kept for reference; the active 32-bit variant follows
 * in the #else branch.
 *
 * NOTE(review): the first function is named OPNAME##_pixels while the
 * CALL_2X_PIXELS line below references OPNAME##_pixels_c — this dead
 * branch would not compile as-is if re-enabled; confirm against upstream
 * before removing the #if 0.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* 64-bit "avg" op: SWAR rounding average of two 8-byte packed values */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
667 #else // 64 bit variant
668
669 #define PIXOP2(OPNAME, OP) \
670 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
671 int i;\
672 for(i=0; i<h; i++){\
673 OP(*((uint16_t*)(block )), LD16(pixels ));\
674 pixels+=line_size;\
675 block +=line_size;\
676 }\
677 }\
678 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
679 int i;\
680 for(i=0; i<h; i++){\
681 OP(*((uint32_t*)(block )), LD32(pixels ));\
682 pixels+=line_size;\
683 block +=line_size;\
684 }\
685 }\
686 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
687 int i;\
688 for(i=0; i<h; i++){\
689 OP(*((uint32_t*)(block )), LD32(pixels ));\
690 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
691 pixels+=line_size;\
692 block +=line_size;\
693 }\
694 }\
695 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
696 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
697 }\
698 \
699 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
700 int src_stride1, int src_stride2, int h){\
701 int i;\
702 for(i=0; i<h; i++){\
703 uint32_t a,b;\
704 a= LD32(&src1[i*src_stride1 ]);\
705 b= LD32(&src2[i*src_stride2 ]);\
706 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
707 a= LD32(&src1[i*src_stride1+4]);\
708 b= LD32(&src2[i*src_stride2+4]);\
709 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
710 }\
711 }\
712 \
713 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
714 int src_stride1, int src_stride2, int h){\
715 int i;\
716 for(i=0; i<h; i++){\
717 uint32_t a,b;\
718 a= LD32(&src1[i*src_stride1 ]);\
719 b= LD32(&src2[i*src_stride2 ]);\
720 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
721 a= LD32(&src1[i*src_stride1+4]);\
722 b= LD32(&src2[i*src_stride2+4]);\
723 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
724 }\
725 }\
726 \
727 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
728 int src_stride1, int src_stride2, int h){\
729 int i;\
730 for(i=0; i<h; i++){\
731 uint32_t a,b;\
732 a= LD32(&src1[i*src_stride1 ]);\
733 b= LD32(&src2[i*src_stride2 ]);\
734 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
735 }\
736 }\
737 \
738 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
739 int src_stride1, int src_stride2, int h){\
740 int i;\
741 for(i=0; i<h; i++){\
742 uint32_t a,b;\
743 a= LD16(&src1[i*src_stride1 ]);\
744 b= LD16(&src2[i*src_stride2 ]);\
745 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
746 }\
747 }\
748 \
749 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
750 int src_stride1, int src_stride2, int h){\
751 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
752 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
753 }\
754 \
755 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
756 int src_stride1, int src_stride2, int h){\
757 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
758 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
759 }\
760 \
761 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
762 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
763 }\
764 \
765 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
766 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
767 }\
768 \
769 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
770 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
771 }\
772 \
773 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
774 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
775 }\
776 \
777 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
778 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
779 int i;\
780 for(i=0; i<h; i++){\
781 uint32_t a, b, c, d, l0, l1, h0, h1;\
782 a= LD32(&src1[i*src_stride1]);\
783 b= LD32(&src2[i*src_stride2]);\
784 c= LD32(&src3[i*src_stride3]);\
785 d= LD32(&src4[i*src_stride4]);\
786 l0= (a&0x03030303UL)\
787 + (b&0x03030303UL)\
788 + 0x02020202UL;\
789 h0= ((a&0xFCFCFCFCUL)>>2)\
790 + ((b&0xFCFCFCFCUL)>>2);\
791 l1= (c&0x03030303UL)\
792 + (d&0x03030303UL);\
793 h1= ((c&0xFCFCFCFCUL)>>2)\
794 + ((d&0xFCFCFCFCUL)>>2);\
795 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
796 a= LD32(&src1[i*src_stride1+4]);\
797 b= LD32(&src2[i*src_stride2+4]);\
798 c= LD32(&src3[i*src_stride3+4]);\
799 d= LD32(&src4[i*src_stride4+4]);\
800 l0= (a&0x03030303UL)\
801 + (b&0x03030303UL)\
802 + 0x02020202UL;\
803 h0= ((a&0xFCFCFCFCUL)>>2)\
804 + ((b&0xFCFCFCFCUL)>>2);\
805 l1= (c&0x03030303UL)\
806 + (d&0x03030303UL);\
807 h1= ((c&0xFCFCFCFCUL)>>2)\
808 + ((d&0xFCFCFCFCUL)>>2);\
809 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
810 }\
811 }\
812 \
813 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
814 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
815 }\
816 \
817 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
818 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
819 }\
820 \
821 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
822 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
823 }\
824 \
825 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
826 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
827 }\
828 \
829 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
830 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
831 int i;\
832 for(i=0; i<h; i++){\
833 uint32_t a, b, c, d, l0, l1, h0, h1;\
834 a= LD32(&src1[i*src_stride1]);\
835 b= LD32(&src2[i*src_stride2]);\
836 c= LD32(&src3[i*src_stride3]);\
837 d= LD32(&src4[i*src_stride4]);\
838 l0= (a&0x03030303UL)\
839 + (b&0x03030303UL)\
840 + 0x01010101UL;\
841 h0= ((a&0xFCFCFCFCUL)>>2)\
842 + ((b&0xFCFCFCFCUL)>>2);\
843 l1= (c&0x03030303UL)\
844 + (d&0x03030303UL);\
845 h1= ((c&0xFCFCFCFCUL)>>2)\
846 + ((d&0xFCFCFCFCUL)>>2);\
847 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
848 a= LD32(&src1[i*src_stride1+4]);\
849 b= LD32(&src2[i*src_stride2+4]);\
850 c= LD32(&src3[i*src_stride3+4]);\
851 d= LD32(&src4[i*src_stride4+4]);\
852 l0= (a&0x03030303UL)\
853 + (b&0x03030303UL)\
854 + 0x01010101UL;\
855 h0= ((a&0xFCFCFCFCUL)>>2)\
856 + ((b&0xFCFCFCFCUL)>>2);\
857 l1= (c&0x03030303UL)\
858 + (d&0x03030303UL);\
859 h1= ((c&0xFCFCFCFCUL)>>2)\
860 + ((d&0xFCFCFCFCUL)>>2);\
861 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
862 }\
863 }\
864 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
865 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
866 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
867 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
868 }\
869 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
870 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
871 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
872 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
873 }\
874 \
875 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
876 {\
877 int i, a0, b0, a1, b1;\
878 a0= pixels[0];\
879 b0= pixels[1] + 2;\
880 a0 += b0;\
881 b0 += pixels[2];\
882 \
883 pixels+=line_size;\
884 for(i=0; i<h; i+=2){\
885 a1= pixels[0];\
886 b1= pixels[1];\
887 a1 += b1;\
888 b1 += pixels[2];\
889 \
890 block[0]= (a1+a0)>>2; /* FIXME non put */\
891 block[1]= (b1+b0)>>2;\
892 \
893 pixels+=line_size;\
894 block +=line_size;\
895 \
896 a0= pixels[0];\
897 b0= pixels[1] + 2;\
898 a0 += b0;\
899 b0 += pixels[2];\
900 \
901 block[0]= (a1+a0)>>2;\
902 block[1]= (b1+b0)>>2;\
903 pixels+=line_size;\
904 block +=line_size;\
905 }\
906 }\
907 \
908 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
909 {\
910 int i;\
911 const uint32_t a= LD32(pixels );\
912 const uint32_t b= LD32(pixels+1);\
913 uint32_t l0= (a&0x03030303UL)\
914 + (b&0x03030303UL)\
915 + 0x02020202UL;\
916 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
917 + ((b&0xFCFCFCFCUL)>>2);\
918 uint32_t l1,h1;\
919 \
920 pixels+=line_size;\
921 for(i=0; i<h; i+=2){\
922 uint32_t a= LD32(pixels );\
923 uint32_t b= LD32(pixels+1);\
924 l1= (a&0x03030303UL)\
925 + (b&0x03030303UL);\
926 h1= ((a&0xFCFCFCFCUL)>>2)\
927 + ((b&0xFCFCFCFCUL)>>2);\
928 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
929 pixels+=line_size;\
930 block +=line_size;\
931 a= LD32(pixels );\
932 b= LD32(pixels+1);\
933 l0= (a&0x03030303UL)\
934 + (b&0x03030303UL)\
935 + 0x02020202UL;\
936 h0= ((a&0xFCFCFCFCUL)>>2)\
937 + ((b&0xFCFCFCFCUL)>>2);\
938 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
939 pixels+=line_size;\
940 block +=line_size;\
941 }\
942 }\
943 \
/* Half-pel (x+1/2, y+1/2) interpolation of an 8-pixel-wide, h-row block:    */\
/* each output byte is the rounded average of a 2x2 source neighbourhood.    */\
/* SWAR technique, 4 bytes per uint32_t: l0/l1 hold the low 2 bits of each   */\
/* byte (l0 also carries the per-byte rounding bias 0x02), h0/h1 the high    */\
/* 6 bits pre-shifted down by 2, so four byte sums can be accumulated        */\
/* without carries crossing byte lanes.  OP() is the put/avg store macro.    */\
944 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
945 {\
946 int j;\
/* two passes over the rows: bytes 0-3 first, then bytes 4-7 */\
947 for(j=0; j<2; j++){\
948 int i;\
949 const uint32_t a= LD32(pixels );\
950 const uint32_t b= LD32(pixels+1);\
951 uint32_t l0= (a&0x03030303UL)\
952 + (b&0x03030303UL)\
953 + 0x02020202UL;\
954 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
955 + ((b&0xFCFCFCFCUL)>>2);\
956 uint32_t l1,h1;\
957 \
958 pixels+=line_size;\
/* rows are processed in pairs so the bottom pair of each 2x2 window */\
/* (l1/h1, then the refreshed l0/h0) is reused as the next top pair  */\
959 for(i=0; i<h; i+=2){\
960 uint32_t a= LD32(pixels );\
961 uint32_t b= LD32(pixels+1);\
962 l1= (a&0x03030303UL)\
963 + (b&0x03030303UL);\
964 h1= ((a&0xFCFCFCFCUL)>>2)\
965 + ((b&0xFCFCFCFCUL)>>2);\
966 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
967 pixels+=line_size;\
968 block +=line_size;\
969 a= LD32(pixels );\
970 b= LD32(pixels+1);\
971 l0= (a&0x03030303UL)\
972 + (b&0x03030303UL)\
973 + 0x02020202UL;\
974 h0= ((a&0xFCFCFCFCUL)>>2)\
975 + ((b&0xFCFCFCFCUL)>>2);\
976 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
977 pixels+=line_size;\
978 block +=line_size;\
979 }\
/* rewind to the top row and step 4 columns right for the second half */\
980 pixels+=4-line_size*(h+1);\
981 block +=4-line_size*h;\
982 }\
983 }\
984 \
/* Same 2x2 half-pel average as _pixels8_xy2_c, but with a per-byte bias of */\
/* 0x01 instead of 0x02: the downward-biased "no rounding" variant that     */\
/* some codecs require for xy half-pel prediction.                          */\
985 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
986 {\
987 int j;\
/* bytes 0-3 on the first pass, bytes 4-7 on the second */\
988 for(j=0; j<2; j++){\
989 int i;\
990 const uint32_t a= LD32(pixels );\
991 const uint32_t b= LD32(pixels+1);\
992 uint32_t l0= (a&0x03030303UL)\
993 + (b&0x03030303UL)\
994 + 0x01010101UL;\
995 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
996 + ((b&0xFCFCFCFCUL)>>2);\
997 uint32_t l1,h1;\
998 \
999 pixels+=line_size;\
1000 for(i=0; i<h; i+=2){\
1001 uint32_t a= LD32(pixels );\
1002 uint32_t b= LD32(pixels+1);\
1003 l1= (a&0x03030303UL)\
1004 + (b&0x03030303UL);\
1005 h1= ((a&0xFCFCFCFCUL)>>2)\
1006 + ((b&0xFCFCFCFCUL)>>2);\
1007 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1008 pixels+=line_size;\
1009 block +=line_size;\
1010 a= LD32(pixels );\
1011 b= LD32(pixels+1);\
1012 l0= (a&0x03030303UL)\
1013 + (b&0x03030303UL)\
1014 + 0x01010101UL;\
1015 h0= ((a&0xFCFCFCFCUL)>>2)\
1016 + ((b&0xFCFCFCFCUL)>>2);\
1017 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1018 pixels+=line_size;\
1019 block +=line_size;\
1020 }\
/* rewind to the top row, advance 4 columns for the second half */\
1021 pixels+=4-line_size*(h+1);\
1022 block +=4-line_size*h;\
1023 }\
1024 }\
1025 \
/* 16-pixel-wide variants: CALL_2X_PIXELS (defined elsewhere) presumably     */\
/* applies the 8-wide function twice, 8 bytes apart -- TODO confirm.         */\
/* NOTE(review): no_rnd_pixels16 maps onto the *rounding* pixels8 copy;      */\
/* for a full-pel copy rounding is irrelevant, so this looks intentional.    */\
1026 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1027 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1028 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1029 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1030 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1031 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1032 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1033 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1034
1035 #define op_avg(a, b) a = rnd_avg32(a, b)
1036 #endif
1037 #define op_put(a, b) a = b
1038
1039 PIXOP2(avg, op_avg)
1040 PIXOP2(put, op_put)
1041 #undef op_avg
1042 #undef op_put
1043
1044 #define avg2(a,b) ((a+b+1)>>1)
1045 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1046
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    /* Convenience wrapper: blend a and b into dst (no-rounding, 16 wide)
       when all three buffers share the same stride. */
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1050
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    /* Convenience wrapper: blend a and b into dst (no-rounding, 8 wide)
       when all three buffers share the same stride. */
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1054
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* One-warp-point global motion compensation: 2x2 bilinear filter at a
     * fixed 1/16-pel fractional position (x16, y16) over an 8-pixel-wide,
     * h-row block.  The four tap weights A..D always sum to 256, so the
     * >>8 at the end renormalizes the weighted sum of the neighbourhood. */
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (uint8_t)((A * src[col]
                                + B * src[col + 1]
                                + C * src[stride + col]
                                + D * src[stride + col + 1]
                                + rounder) >> 8);
        }
        dst += stride;
        src += stride;
    }
}
1077
/*
 * General (per-pixel-varying) global motion compensation for an
 * 8-pixel-wide, h-row block.  (ox, oy) is the fixed-point source position
 * of the first output pixel; (dxx, dyx) are added per output column and
 * (dxy, dyy) per output row, so the source position varies linearly in
 * both directions.  The fractional precision is s = 1<<shift sub-pel
 * steps per pixel; r is the bilinear rounding constant and >>(shift*2)
 * renormalizes the two chained interpolation weights.
 * Source positions outside [0,width)x[0,height) are clamped via clip()
 * (project helper, presumably clamps to [lo,hi] -- defined elsewhere),
 * and the bilinear filter degenerates to 1-D or nearest-pixel there.
 */
1078 static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1079 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1080 {
1081 int y, vx, vy;
1082 const int s= 1<<shift;
1083
/* turn width/height into the largest valid integer coordinate */
1084 width--;
1085 height--;
1086
1087 for(y=0; y<h; y++){
1088 int x;
1089
1090 vx= ox;
1091 vy= oy;
1092 for(x=0; x<8; x++){ //XXX FIXME optimize
1093 int src_x, src_y, frac_x, frac_y, index;
1094
/* vx/vy carry 16 extra fractional bits on top of the s sub-pel steps:   */
/* >>16 yields the position in sub-pel units, &(s-1) the sub-pel part,   */
/* >>shift the integer pixel coordinate                                  */
1095 src_x= vx>>16;
1096 src_y= vy>>16;
1097 frac_x= src_x&(s-1);
1098 frac_y= src_y&(s-1);
1099 src_x>>=shift;
1100 src_y>>=shift;
1101
/* (unsigned) compare tests 0 <= coord < limit in one branch */
1102 if((unsigned)src_x < width){
1103 if((unsigned)src_y < height){
/* fully inside: 2-D bilinear from the 2x2 neighbourhood */
1104 index= src_x + src_y*stride;
1105 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1106 + src[index +1]* frac_x )*(s-frac_y)
1107 + ( src[index+stride ]*(s-frac_x)
1108 + src[index+stride+1]* frac_x )* frac_y
1109 + r)>>(shift*2);
1110 }else{
/* off the top/bottom: clamp y, interpolate horizontally only */
1111 index= src_x + clip(src_y, 0, height)*stride;
1112 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1113 + src[index +1]* frac_x )*s
1114 + r)>>(shift*2);
1115 }
1116 }else{
1117 if((unsigned)src_y < height){
/* off the left/right: clamp x, interpolate vertically only */
1118 index= clip(src_x, 0, width) + src_y*stride;
1119 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1120 + src[index+stride ]* frac_y )*s
1121 + r)>>(shift*2);
1122 }else{
/* off a corner: clamp both, take the nearest pixel as-is */
1123 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
1124 dst[y*stride + x]= src[index ];
1125 }
1126 }
1127
1128 vx+= dxx;
1129 vy+= dyx;
1130 }
1131 ox += dxy;
1132 oy += dyy;
1133 }
1134 }
1135
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Full-pel thirdpel position: plain block copy, dispatched on width.
       Unsupported widths are silently ignored, as in the original. */
    if (width == 2)
        put_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1144
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel (1/3, 0) interpolation: out = (2*left + right) scaled by
     * 683/2048 (fixed-point ~1/3), with the +1 folded in for rounding. */
    int row, x;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (x = 0; x < width; x++)
            d[x] = (uint8_t)((683 * (2 * s[x] + s[x + 1] + 1)) >> 11);
    }
}
1155
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel (2/3, 0) interpolation: out = (left + 2*right) scaled by
     * 683/2048 (fixed-point ~1/3), with the +1 folded in for rounding. */
    int row, x;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (x = 0; x < width; x++)
            d[x] = (uint8_t)((683 * (s[x] + 2 * s[x + 1] + 1)) >> 11);
    }
}
1166
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel (0, 1/3) interpolation: out = (2*top + bottom) scaled by
     * 683/2048 (fixed-point ~1/3), with the +1 folded in for rounding. */
    int row, x;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (x = 0; x < width; x++)
            d[x] = (uint8_t)((683 * (2 * s[x] + s[x + stride] + 1)) >> 11);
    }
}
1177
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel (1/3, 1/3): 2x2 taps weighted 4/3/3/2 (sum 12), scaled by
     * 2731/32768 (fixed-point ~1/12); +6 gives rounding to nearest. */
    int row, x;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (x = 0; x < width; x++)
            d[x] = (uint8_t)((2731 * (4 * s[x] + 3 * s[x + 1]
                                    + 3 * s[x + stride] + 2 * s[x + stride + 1] + 6)) >> 15);
    }
}
1188
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel (1/3, 2/3): 2x2 taps weighted 3/2/4/3 (sum 12), scaled by
     * 2731/32768 (fixed-point ~1/12); +6 gives rounding to nearest. */
    int row, x;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (x = 0; x < width; x++)
            d[x] = (uint8_t)((2731 * (3 * s[x] + 2 * s[x + 1]
                                    + 4 * s[x + stride] + 3 * s[x + stride + 1] + 6)) >> 15);
    }
}
1199
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel (0, 2/3) interpolation: out = (top + 2*bottom) scaled by
     * 683/2048 (fixed-point ~1/3), with the +1 folded in for rounding. */
    int row, x;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (x = 0; x < width; x++)
            d[x] = (uint8_t)((683 * (s[x] + 2 * s[x + stride] + 1)) >> 11);
    }
}
1210
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel (2/3, 1/3): 2x2 taps weighted 3/4/2/3 (sum 12), scaled by
     * 2731/32768 (fixed-point ~1/12); +6 gives rounding to nearest. */
    int row, x;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (x = 0; x < width; x++)
            d[x] = (uint8_t)((2731 * (3 * s[x] + 4 * s[x + 1]
                                    + 2 * s[x + stride] + 3 * s[x + stride + 1] + 6)) >> 15);
    }
}
1221
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel (2/3, 2/3): 2x2 taps weighted 2/3/3/4 (sum 12), scaled by
     * 2731/32768 (fixed-point ~1/12); +6 gives rounding to nearest. */
    int row, x;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (x = 0; x < width; x++)
            d[x] = (uint8_t)((2731 * (2 * s[x] + 3 * s[x + 1]
                                    + 3 * s[x + stride] + 4 * s[x + stride + 1] + 6)) >> 15);
    }
}
1232
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Full-pel thirdpel position: average src into dst, dispatched on
       width.  Unsupported widths are silently ignored, as in the original. */
    if (width == 2)
        avg_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1241
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel (1/3, 0) prediction (2*left + right, scaled by 683/2048),
     * averaged into the existing dst with upward rounding. */
    int row, x;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (x = 0; x < width; x++) {
            const int pred = (683 * (2 * s[x] + s[x + 1] + 1)) >> 11;
            d[x] = (uint8_t)((d[x] + pred + 1) >> 1);
        }
    }
}
1252
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel (2/3, 0) prediction (left + 2*right, scaled by 683/2048),
     * averaged into the existing dst with upward rounding. */
    int row, x;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (x = 0; x < width; x++) {
            const int pred = (683 * (s[x] + 2 * s[x + 1] + 1)) >> 11;
            d[x] = (uint8_t)((d[x] + pred + 1) >> 1);
        }
    }
}
1263
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel (0, 1/3) prediction (2*top + bottom, scaled by 683/2048),
     * averaged into the existing dst with upward rounding. */
    int row, x;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (x = 0; x < width; x++) {
            const int pred = (683 * (2 * s[x] + s[x + stride] + 1)) >> 11;
            d[x] = (uint8_t)((d[x] + pred + 1) >> 1);
        }
    }
}
1274
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel (1/3, 1/3) prediction (taps 4/3/3/2, scaled by 2731/32768),
     * averaged into the existing dst with upward rounding. */
    int row, x;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (x = 0; x < width; x++) {
            const int pred = (2731 * (4 * s[x] + 3 * s[x + 1]
                                    + 3 * s[x + stride] + 2 * s[x + stride + 1] + 6)) >> 15;
            d[x] = (uint8_t)((d[x] + pred + 1) >> 1);
        }
    }
}
1285
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel (1/3, 2/3) prediction (taps 3/2/4/3, scaled by 2731/32768),
     * averaged into the existing dst with upward rounding. */
    int row, x;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (x = 0; x < width; x++) {
            const int pred = (2731 * (3 * s[x] + 2 * s[x + 1]
                                    + 4 * s[x + stride] + 3 * s[x + stride + 1] + 6)) >> 15;
            d[x] = (uint8_t)((d[x] + pred + 1) >> 1);
        }
    }
}
1296
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel (0, 2/3) prediction (top + 2*bottom, scaled by 683/2048),
     * averaged into the existing dst with upward rounding. */
    int row, x;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (x = 0; x < width; x++) {
            const int pred = (683 * (s[x] + 2 * s[x + stride] + 1)) >> 11;
            d[x] = (uint8_t)((d[x] + pred + 1) >> 1);
        }
    }
}
1307
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel (2/3, 1/3) prediction (taps 3/4/2/3, scaled by 2731/32768),
     * averaged into the existing dst with upward rounding. */
    int row, x;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (x = 0; x < width; x++) {
            const int pred = (2731 * (3 * s[x] + 4 * s[x + 1]
                                    + 2 * s[x + stride] + 3 * s[x + stride + 1] + 6)) >> 15;
            d[x] = (uint8_t)((d[x] + pred + 1) >> 1);
        }
    }
}
1318
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel (2/3, 2/3) prediction (taps 2/3/3/4, scaled by 2731/32768),
     * averaged into the existing dst with upward rounding. */
    int row, x;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (x = 0; x < width; x++) {
            const int pred = (2731 * (2 * s[x] + 3 * s[x + 1]
                                    + 3 * s[x + stride] + 4 * s[x + stride + 1] + 6)) >> 15;
            d[x] = (uint8_t)((d[x] + pred + 1) >> 1);
        }
    }
}
1329 #if 0
1330 #define TPEL_WIDTH(width)\
1331 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1332 void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1333 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1334 void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1335 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1336 void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1337 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1338 void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1339 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1340 void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1341 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1342 void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1343 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1344 void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1345 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1346 void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1347 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1348 void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1349 #endif
1350
/*
 * H.264 chroma motion compensation (2x2 bilinear), generated once per
 * output op (put/avg -- see the op_put/op_avg defines below the macro).
 * (x, y) is the eighth-pel fractional offset, 0..7 in each dimension;
 * the four tap weights A..D always sum to 64, and OP() folds in the
 * +32 rounding bias and the >>6 renormalization.  Variants for block
 * widths 2, 4 and 8 are produced, each processing h rows.
 */
1351 #define H264_CHROMA_MC(OPNAME, OP)\
1352 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1353 const int A=(8-x)*(8-y);\
1354 const int B=( x)*(8-y);\
1355 const int C=(8-x)*( y);\
1356 const int D=( x)*( y);\
1357 int i;\
1358 \
1359 assert(x<8 && y<8 && x>=0 && y>=0);\
1360 \
1361 for(i=0; i<h; i++)\
1362 {\
1363 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1364 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1365 dst+= stride;\
1366 src+= stride;\
1367 }\
1368 }\
1369 \
1370 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1371 const int A=(8-x)*(8-y);\
1372 const int B=( x)*(8-y);\
1373 const int C=(8-x)*( y);\
1374 const int D=( x)*( y);\
1375 int i;\
1376 \
1377 assert(x<8 && y<8 && x>=0 && y>=0);\
1378 \
1379 for(i=0; i<h; i++)\
1380 {\
1381 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1382 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1383 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1384 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1385 dst+= stride;\
1386 src+= stride;\
1387 }\
1388 }\
1389 \
1390 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1391 const int A=(8-x)*(8-y);\
1392 const int B=( x)*(8-y);\
1393 const int C=(8-x)*( y);\
1394 const int D=( x)*( y);\
1395 int i;\
1396 \
1397 assert(x<8 && y<8 && x>=0 && y>=0);\
1398 \
1399 for(i=0; i<h; i++)\
1400 {\
1401 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1402 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1403 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1404 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1405 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1406 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1407 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1408 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1409 dst+= stride;\
1410 src+= stride;\
1411 }\
1412 }
1413
1414 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1415 #define op_put(a, b) a = (((b) + 32)>>6)
1416
1417 H264_CHROMA_MC(put_ , op_put)
1418 H264_CHROMA_MC(avg_ , op_avg)
1419 #undef op_avg
1420 #undef op_put
1421
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 4-pixel-wide block, h rows, one 32-bit move per row via the
       LD32/ST32 macros (from dsputil.h -- presumably unaligned-safe). */
    int rows;
    for (rows = h; rows > 0; rows--) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1432
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy an 8-pixel-wide block, h rows, two 32-bit moves per row via the
       LD32/ST32 macros (from dsputil.h -- presumably unaligned-safe). */
    int rows;
    for (rows = h; rows > 0; rows--) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
1444
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 16-pixel-wide block, h rows, four 32-bit moves per row via the
       LD32/ST32 macros (from dsputil.h -- presumably unaligned-safe). */
    int rows;
    for (rows = h; rows > 0; rows--) {
        ST32(dst,      LD32(src));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}
1458
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 17-pixel-wide block (16 pixels + the extra right-edge column
       needed by the qpel filters): four 32-bit moves plus one tail byte
       per row.  LD32/ST32 come from dsputil.h (presumably unaligned-safe). */
    int rows;
    for (rows = h; rows > 0; rows--) {
        ST32(dst,      LD32(src));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1473
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 9-pixel-wide block (8 pixels + the extra right-edge column
       needed by the qpel filters): two 32-bit moves plus one tail byte
       per row.  LD32/ST32 come from dsputil.h (presumably unaligned-safe). */
    int rows;
    for (rows = h; rows > 0; rows--) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1486
1487
1488 #define QPEL_MC(r, OPNAME, RND, OP) \
1489 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1490 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1491 int i;\
1492 for(i=0; i<h; i++)\
1493 {\
1494 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1495 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1496 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1497 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1498 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1499 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1500 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1501 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1502 dst+=dstStride;\
1503 src+=srcStride;\
1504 }\
1505 }\
1506 \
1507 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1508 const int w=8;\
1509 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1510 int i;\
1511 for(i=0; i<w; i++)\
1512 {\
1513 const int src0= src[0*srcStride];\
1514 const int src1= src[1*srcStride];\
1515 const int src2= src[2*srcStride];\
1516 const int src3= src[3*srcStride];\
1517 const int src4= src[4*srcStride];\
1518 const int src5= src[5*srcStride];\
1519 const int src6= src[6*srcStride];\
1520 const int src7= src[7*srcStride];\
1521 const int src8= src[8*srcStride];\
1522 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1523 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1524 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1525 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1526 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1527 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1528 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1529 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1530 dst++;\
1531 src++;\
1532 }\
1533 }\
1534 \
1535 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1536 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1537 int i;\
1538 \
1539 for(i=0; i<h; i++)\
1540 {\
1541 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1542 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1543 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1544 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1545 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1546 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1547 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1548 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1549 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1550 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1551 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1552 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1553 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1554 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1555 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1556 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1557 dst+=dstStride;\
1558 src+=srcStride;\
1559 }\
1560 }\
1561 \
1562 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1563 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1564 int i;\
1565 const int w=16;\
1566 for(i=0; i<w; i++)\
1567 {\
1568 const int src0= src[0*srcStride];\
1569 const int src1= src[1*srcStride];\
1570 const int src2= src[2*srcStride];\
1571 const int src3= src[3*srcStride];\
1572 const int src4= src[4*srcStride];\
1573 const int src5= src[5*srcStride];\
1574 const int src6= src[6*srcStride];\
1575 const int src7= src[7*srcStride];\
1576 const int src8= src[8*srcStride];\
1577 const int src9= src[9*srcStride];\
1578 const int src10= src[10*srcStride];\
1579 const int src11= src[11*srcStride];\
1580 const int src12= src[12*srcStride];\
1581 const int src13= src[13*srcStride];\
1582 const int src14= src[14*srcStride];\
1583 const int src15= src[15*srcStride];\
1584 const int src16= src[16*srcStride];\
1585 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1586 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1587 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1588 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1589 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1590 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1591 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1592 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1593 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1594 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1595 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1596 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1597 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1598 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1599 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1600 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1601 dst++;\
1602 src++;\
1603 }\
1604 }\
1605 \
1606 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1607 OPNAME ## pixels8_c(dst, src, stride, 8);\
1608 }\
1609 \
1610 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1611 uint8_t half[64];\
1612 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1613 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1614 }\
1615 \
1616 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1617 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1618 }\
1619 \
1620 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1621 uint8_t half[64];\
1622 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1623 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1624 }\
1625 \
1626 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1627 uint8_t full[16*9];\
1628 uint8_t half[64];\
1629 copy_block9(full, src, 16, stride, 9);\
1630 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1631 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1632 }\
1633 \
1634 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1635 uint8_t full[16*9];\
1636 copy_block9(full, src, 16, stride, 9);\
1637 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1638 }\
1639 \
1640 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1641 uint8_t full[16*9];\
1642 uint8_t half[64];\
1643 copy_block9(full, src, 16, stride, 9);\
1644 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1645 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1646 }\
1647 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1648 uint8_t full[16*9];\
1649 uint8_t halfH[72];\
1650 uint8_t halfV[64];\
1651 uint8_t halfHV[64];\
1652 copy_block9(full, src, 16, stride, 9);\
1653 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1654 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1655 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1656 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1657 }\
1658 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1659 uint8_t full[16*9];\
1660 uint8_t halfH[72];\
1661 uint8_t halfHV[64];\
1662 copy_block9(full, src, 16, stride, 9);\
1663 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1664 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1665 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1666 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1667 }\
1668 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1669 uint8_t full[16*9];\
1670 uint8_t halfH[72];\
1671 uint8_t halfV[64];\
1672 uint8_t halfHV[64];\
1673 copy_block9(full, src, 16, stride, 9);\
1674 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1675 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1676 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1677 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1678 }\
1679 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1680 uint8_t full[16*9];\
1681 uint8_t halfH[72];\
1682 uint8_t halfHV[64];\
1683 copy_block9(full, src, 16, stride, 9);\
1684 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1685 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1686 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1687 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1688 }\
1689 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1690 uint8_t full[16*9];\
1691 uint8_t halfH[72];\
1692 uint8_t halfV[64];\
1693 uint8_t halfHV[64];\
1694 copy_block9(full, src, 16, stride, 9);\
1695 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1696 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1697 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1698 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1699 }\
1700 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1701 uint8_t full[16*9];\
1702 uint8_t halfH[72];\
1703 uint8_t halfHV[64];\
1704 copy_block9(full, src, 16, stride, 9);\
1705 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1706 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1707 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1708 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1709 }\
1710 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1711 uint8_t full[16*9];\
1712 uint8_t halfH[72];\
1713 uint8_t halfV[64];\
1714 uint8_t halfHV[64];\
1715 copy_block9(full, src, 16, stride, 9);\
1716 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1717 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1718 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1719 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1720 }\
1721 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1722 uint8_t full[16*9];\
1723 uint8_t halfH[72];\
1724 uint8_t halfHV[64];\
1725 copy_block9(full, src, 16, stride, 9);\
1726 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1727 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1728 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1729 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1730 }\
1731 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1732 uint8_t halfH[72];\
1733 uint8_t halfHV[64];\
1734 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1735 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1736 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1737 }\
1738 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1739 uint8_t halfH[72];\
1740 uint8_t halfHV[64];\
1741 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1742 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1743 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1744 }\
1745 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1746 uint8_t full[16*9];\
1747 uint8_t halfH[72];\
1748 uint8_t halfV[64];\
1749 uint8_t halfHV[64];\
1750 copy_block9(full, src, 16, stride, 9);\
1751 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1752 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1753 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1754 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1755 }\
1756 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1757 uint8_t full[16*9];\
1758 uint8_t halfH[72];\
1759 copy_block9(full, src, 16, stride, 9);\
1760 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1761 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1762 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1763 }\
1764 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1765 uint8_t full[16*9];\
1766 uint8_t halfH[72];\
1767 uint8_t halfV[64];\
1768 uint8_t halfHV[64];\
1769 copy_block9(full, src, 16, stride, 9);\
1770 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1771 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1772 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1773 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1774 }\
1775 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1776 uint8_t full[16*9];\
1777 uint8_t halfH[72];\
1778 copy_block9(full, src, 16, stride, 9);\
1779 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1780 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1781 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1782 }\
1783 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1784 uint8_t halfH[72];\
1785 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1786 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1787 }\
1788 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1789 OPNAME ## pixels16_c(dst, src, stride, 16);\
1790 }\
1791 \
1792 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1793 uint8_t half[256];\
1794 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1795 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1796 }\
1797 \
1798 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1799 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1800 }\
1801 \
1802 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1803 uint8_t half[256];\
1804 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1805 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1806 }\
1807 \
1808 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1809 uint8_t full[24*17];\
1810 uint8_t half[256];\
1811 copy_block17(full, src, 24, stride, 17);\
1812 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1813 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1814 }\
1815 \
1816 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1817 uint8_t full[24*17];\
1818 copy_block17(full, src, 24, stride, 17);\
1819 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1820 }\
1821 \
1822 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1823 uint8_t full[24*17];\
1824 uint8_t half[256];\
1825 copy_block17(full, src, 24, stride, 17);\
1826 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1827 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1828 }\
1829 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1830 uint8_t full[24*17];\
1831 uint8_t halfH[272];\
1832 uint8_t halfV[256];\
1833 uint8_t halfHV[256];\
1834 copy_block17(full, src, 24, stride, 17);\
1835 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1836 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1837 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1838 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1839 }\
1840 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1841 uint8_t full[24*17];\
1842 uint8_t halfH[272];\
1843 uint8_t halfHV[256];\
1844 copy_block17(full, src, 24, stride, 17);\
1845 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1846 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1847 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1848 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1849 }\
1850 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1851 uint8_t full[24*17];\
1852 uint8_t halfH[272];\
1853 uint8_t halfV[256];\
1854 uint8_t halfHV[256];\
1855 copy_block17(full, src, 24, stride, 17);\
1856 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1857 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1858 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1859 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1860 }\
1861 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1862 uint8_t full[24*17];\
1863 uint8_t halfH[272];\
1864 uint8_t halfHV[256];\
1865 copy_block17(full, src, 24, stride, 17);\
1866 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1867 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1868 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1869 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1870 }\
1871 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1872 uint8_t full[24*17];\
1873 uint8_t halfH[272];\
1874 uint8_t halfV[256];\
1875 uint8_t halfHV[256];\
1876 copy_block17(full, src, 24, stride, 17);\
1877 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1878 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1879 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1880 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1881 }\
1882 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1883 uint8_t full[24*17];\
1884 uint8_t halfH[272];\
1885 uint8_t halfHV[256];\
1886 copy_block17(full, src, 24, stride, 17);\
1887 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1888 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1889 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1890 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1891 }\
1892 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1893 uint8_t full[24*17];\
1894 uint8_t halfH[272];\
1895 uint8_t halfV[256];\
1896 uint8_t halfHV[256];\
1897 copy_block17(full, src, 24, stride, 17);\
1898 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1899 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1900 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1901 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1902 }\
1903 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1904 uint8_t full[24*17];\
1905 uint8_t halfH[272];\
1906 uint8_t halfHV[256];\
1907 copy_block17(full, src, 24, stride, 17);\
1908 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1909 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1910 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1911 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1912 }\
1913 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1914 uint8_t halfH[272];\
1915 uint8_t halfHV[256];\
1916 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1917 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1918 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1919 }\
1920 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1921 uint8_t halfH[272];\
1922 uint8_t halfHV[256];\
1923 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1924 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1925 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1926 }\
1927 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1928 uint8_t full[24*17];\
1929 uint8_t halfH[272];\
1930 uint8_t halfV[256];\
1931 uint8_t halfHV[256];\
1932 copy_block17(full, src, 24, stride, 17);\
1933 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1934 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1935 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1936 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1937 }\
1938 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1939 uint8_t full[24*17];\
1940 uint8_t halfH[272];\
1941 copy_block17(full, src, 24, stride, 17);\
1942 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1943 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1944 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1945 }\
1946 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1947 uint8_t full[24*17];\
1948 uint8_t halfH[272];\
1949 uint8_t halfV[256];\
1950 uint8_t halfHV[256];\
1951 copy_block17(full, src, 24, stride, 17);\
1952 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1953 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1954 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1955 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1956 }\
1957 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1958 uint8_t full[24*17];\
1959 uint8_t halfH[272];\
1960 copy_block17(full, src, 24, stride, 17);\
1961 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1962 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1963 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1964 }\
1965 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1966 uint8_t halfH[272];\
1967 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1968 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1969 }
1970
/* Final store operators for the QPEL_MC-generated functions above.
   (b) is a 6-tap filter sum carrying a scale factor of 32: "+16 >> 5"
   rounds to nearest, "+15 >> 5" is the no-rounding variant, and cm[]
   (cropTbl + MAX_NEG_CROP, in scope inside the generated functions)
   clips the result to 0..255.  The avg variants additionally average
   with the value already in dst. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the MPEG-4 quarter-pel MC functions for the put,
   no-rounding put and avg flavours. */
QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1984
#if 1
/*
 * H.264 6-tap half-pel interpolation filters, taps (1,-5,20,20,-5,1).
 * Generates OPNAME##h264_qpel{4,8,16}_{h,v,hv}_lowpass.
 *
 * OP stores a value filtered in one direction (sum scaled by 32);
 * OP2 stores a value filtered in both directions (sum scaled by 1024).
 * Both are expected to round and clip through cm[]
 * (cropTbl + MAX_NEG_CROP), which maps any in-range sum to 0..255.
 *
 * The hv variants first run the horizontal filter into the int16_t
 * tmp[] buffer for h+5 rows (2 extra rows above and 3 below, the
 * margin the vertical filter needs), then filter tmp[] vertically
 * into dst.  The 16x16 versions are composed from four 8x8 calls.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        /* one column at a time; B/A are the two rows above row 0 */\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    /* horizontal pass over h+5 rows into tmp (unclipped 16-bit sums) */\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind so tmp points at the block's row 0 (2 margin rows above) */\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
/* 16-wide versions: four 8x8 calls covering the four quadrants */\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\
2187
/*
 * Generates the 16 H.264 quarter-pel MC entry points
 * OPNAME##h264_qpel##SIZE##_mc(XY)_c for one block size.
 *
 * The mcXY suffix gives the sub-pel offset in quarter pixels:
 * X horizontal, Y vertical, each in {0,1,2,3}.  Half-pel planes come
 * from the H264_LOWPASS filters; quarter-pel positions are formed by
 * averaging two planes with pixels##SIZE##_l2.  Positions needing the
 * vertical filter first copy SIZE+5 source rows (2 above, 3 below)
 * into full[], with full_mid pointing at the block's row 0.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
/* diagonal quarter-pel positions: average of an H and a V half plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
2324
/* Rounding store operators for H264_LOWPASS: 1-D filter sums carry a
   factor of 32 (hence "+16 >> 5"), the 2-D hv sums a factor of 1024
   (hence "+512 >> 10"); cm[] clips to 0..255.  The avg variants average
   the clipped value with the existing dst pixel, rounding up. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]

/* Instantiate the H.264 qpel filters and MC entry points for the
   put and avg flavours at all three block sizes. */
H264_LOWPASS(put_ , op_put, op2_put)
H264_LOWPASS(avg_ , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2345
2346 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2347 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2348 int i;
2349
2350 for(i=0; i<h; i++){
2351 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2352 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2353 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2354 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2355 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2356 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2357 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2358 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2359 dst+=dstStride;
2360 src+=srcStride;
2361 }
2362 }
2363
2364 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2365 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2366 int i;
2367
2368 for(i=0; i<w; i++){
2369 const int src_1= src[ -srcStride];
2370 const int src0 = src[0 ];
2371 const int src1 = src[ srcStride];
2372 const int src2 = src[2*srcStride];
2373 const int src3 = src[3*srcStride];
2374 const int src4 = src[4*srcStride];
2375 const int src5 = src[5*srcStride];
2376 const int src6 = src[6*srcStride];
2377 const int src7 = src[7*srcStride];
2378 const int src8 = src[8*srcStride];
2379 const int src9 = src[9*srcStride];
2380 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2381 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2382 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2383 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2384 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2385 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2386 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2387 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2388 src++;
2389 dst++;
2390 }
2391 }
2392
/* WMV2 half-pel ("mspel") 8x8 MC entry points.  The mcXY suffix gives
   the sub-pel position (X horizontal, Y vertical): 0 = integer pel,
   2 = half pel via the 4-tap lowpass, 1/3 = average of the integer and
   half-pel planes.  Only the positions used by WMV2 are implemented. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    /* average the unfiltered pixels with the horizontal half-pel plane */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    /* as mc10, but averaging with the pixel one to the right */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    /* 11 horizontally filtered rows starting one row above the block,
       so the vertical filter below has its -1..+2 row margin; halfH+8
       is the block's row 0 */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    /* average the V-only and H+V half-pel planes */
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    /* as mc12, but the V-only plane is taken one pixel to the right */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    /* horizontal pass with margin rows, then vertical pass to dst */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2440
/**
 * H.263 in-loop deblocking filter across a horizontal block edge.
 * src points at the first row below the edge; for each of 8 columns the
 * two rows above (p0, p1) and below (p2, p3) the edge are filtered in
 * place.  Filter strength is looked up from the quantizer.
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        /* d: gradient across the edge, dominated by the p1/p2 step */
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* d1: ramp function of d — pass small gradients through
           unchanged, fade the correction back to zero between strength
           and 2*strength, and leave large (real-edge) steps alone */
        if (d<-2*strength) d1= 0;
        else if(d<- strength) d1=-2*strength - d;
        else if(d< strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else d1= 0;

        p1 += d1;
        p2 -= d1;
        /* clip back to 0..255: after +-d1, values stay within -256..511,
           where bit 256 flags an overflow; ~(x>>31) is 255 for positive
           overflow and 0 for negative */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= ABS(d1)>>1;

        /* secondary correction of the outer pixels, limited to |d1|/2 */
        d2= clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+ stride] = p3 + d2;
    }
}
2475
/**
 * H.263 in-loop deblocking filter across a vertical block edge.
 * Mirror of h263_v_loop_filter_c: src points at the first column right
 * of the edge; for each of 8 rows the two pixels left (p0, p1) and
 * right (p2, p3) of the edge are filtered in place.
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        /* d: gradient across the edge, dominated by the p1/p2 step */
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* d1: ramp of d — small gradients pass through, medium ones are
           faded out, large (real-edge) steps are left untouched */
        if (d<-2*strength) d1= 0;
        else if(d<- strength) d1=-2*strength - d;
        else if(d< strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branchless clip to 0..255 for values in -256..511; see the
           vertical filter above for the bit trick */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= ABS(d1)>>1;

        /* secondary correction of the outer pixels, limited to |d1|/2 */
        d2= clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
}
2510
/**
 * H.261 in-loop filter: separable 1-2-1 smoothing of an 8x8 block,
 * applied in place.  Border rows/columns are not filtered across the
 * block edge (they only get the identity tap, scaled to match).
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int blk[64];
    int row, col;

    /* vertical 1-2-1 pass into blk[]; the top and bottom rows have no
       neighbour inside the block, so they are just scaled by 4 */
    for (col = 0; col < 8; col++) {
        blk[col]       = 4*src[col];
        blk[col + 7*8] = 4*src[col + 7*stride];
    }
    for (row = 1; row < 7; row++) {
        for (col = 0; col < 8; col++) {
            const int s = row*stride + col;
            blk[row*8 + col] = src[s - stride] + 2*src[s] + src[s + stride];
        }
    }

    /* horizontal 1-2-1 pass back into src with rounding; edge columns
       only undo the vertical scale (>>2), interior ones divide by 16 */
    for (row = 0; row < 8; row++) {
        src[row*stride]     = (blk[row*8]     + 2)>>2;
        src[row*stride + 7] = (blk[row*8 + 7] + 2)>>2;
        for (col = 1; col < 7; col++) {
            const int t = row*8 + col;
            src[row*stride + col] = (blk[t-1] + 2*blk[t] + blk[t+1] + 8)>>4;
        }
    }
}
2537
/**
 * Sum of absolute differences between two 16-pixel-wide blocks of height h.
 * @param v unused context pointer (me_cmp_func signature compatibility)
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2565
/**
 * SAD between pix1 and pix2 half-pel interpolated horizontally
 * (avg2 of each pixel and its right neighbour), 16 wide, height h.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2593
/**
 * SAD between pix1 and pix2 half-pel interpolated vertically
 * (avg2 of each pixel and the one a line below), 16 wide, height h.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2623
/**
 * SAD between pix1 and pix2 half-pel interpolated both ways
 * (avg4 of the 2x2 neighbourhood), 16 wide, height h.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2653
/**
 * Sum of absolute differences between two 8-pixel-wide blocks of height h.
 * @param v unused context pointer (me_cmp_func signature compatibility)
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2673
/**
 * SAD between pix1 and pix2 half-pel interpolated horizontally
 * (avg2 of each pixel and its right neighbour), 8 wide, height h.
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2693
/**
 * SAD between pix1 and pix2 half-pel interpolated vertically
 * (avg2 of each pixel and the one a line below), 8 wide, height h.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2715
/**
 * SAD between pix1 and pix2 half-pel interpolated both ways
 * (avg4 of the 2x2 neighbourhood), 8 wide, height h.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2737
2738 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2739 int score1=0;
2740 int score2=0;
2741 int x,y;
2742
2743 for(y=0; y<h; y++){
2744 for(x=0; x<16; x++){
2745 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2746 }
2747 if(y+1<h){
2748 for(x=0; x<15; x++){
2749 score2+= ABS( s1[x ] - s1[x +stride]
2750 - s1[x+1] + s1[x+1+stride])
2751 -ABS( s2[x ] - s2[x +stride]
2752 - s2[x+1] + s2[x+1+stride]);
2753 }
2754 }
2755 s1+= stride;
2756 s2+= stride;
2757 }
2758
2759 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2760 else return score1 + ABS(score2)*8;
2761 }
2762
2763 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2764 int score1=0;
2765 int score2=0;
2766 int x,y;
2767
2768 for(y=0; y<h; y++){
2769 for(x=0; x<8; x++){
2770 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2771 }
2772 if(y+1<h){
2773 for(x=0; x<7; x++){
2774 score2+= ABS( s1[x ] - s1[x +stride]
2775 - s1[x+1] + s1[x+1+stride])
2776 -ABS( s2[x ] - s2[x +stride]
2777 - s2[x+1] + s2[x+1+stride]);
2778 }
2779 }
2780 s1+= stride;
2781 s2+= stride;
2782 }
2783
2784 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2785 else return score1 + ABS(score2)*8;
2786 }
2787
2788 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2789 int i;
2790 unsigned int sum=0;
2791
2792 for(i=0; i<8*8; i++){
2793 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2794 int w= weight[i];
2795 b>>= RECON_SHIFT;
2796 assert(-512<b && b<512);
2797
2798 sum += (w*b)*(w*b)>>4;
2799 }
2800 return sum>>2;
2801 }
2802
2803 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2804 int i;
2805
2806 for(i=0; i<8*8; i++){
2807 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2808 }
2809 }
2810
2811 /**
2812 * permutes an 8x8 block.
2813 * @param block the block which will be permuted according to the given permutation vector
2814 * @param permutation the permutation vector
2815 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2816 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2817 * (inverse) permutated to scantable order!
2818 */
2819 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2820 {
2821 int i;
2822 DCTELEM temp[64];
2823
2824 if(last<=0) return;
2825 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2826
2827 for(i=0; i<=last; i++){
2828 const int j= scantable[i];
2829 temp[j]= block[j];
2830 block[j]=0;
2831 }
2832
2833 for(i=0; i<=last; i++){
2834 const int j= scantable[i];
2835 const int perm_j= permutation[j];
2836 block[perm_j]= temp[j];
2837 }
2838 }
2839
/* Comparison function that always reports a cost of 0 (used for FF_CMP_ZERO,
   i.e. to disable comparison-based decisions entirely). All arguments ignored. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
2843
2844 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
2845 int i;
2846
2847 memset(cmp, 0, sizeof(void*)*5);
2848
2849 for(i=0; i<5; i++){
2850 switch(type&0xFF){
2851 case FF_CMP_SAD:
2852 cmp[i]= c->sad[i];
2853 break;
2854 case FF_CMP_SATD:
2855 cmp[i]= c->hadamard8_diff[i];
2856 break;
2857 case FF_CMP_SSE:
2858 cmp[i]= c->sse[i];
2859 break;
2860 case FF_CMP_DCT:
2861 cmp[i]= c->dct_sad[i];
2862 break;
2863 case FF_CMP_PSNR:
2864 cmp[i]= c->quant_psnr[i];
2865 break;
2866 case FF_CMP_BIT:
2867 cmp[i]= c->bit[i];
2868 break;
2869 case FF_CMP_RD:
2870 cmp[i]= c->rd[i];
2871 break;
2872 case FF_CMP_VSAD:
2873 cmp[i]= c->vsad[i];
2874 break;
2875 case FF_CMP_VSSE:
2876 cmp[i]= c->vsse[i];
2877 break;
2878 case FF_CMP_ZERO:
2879 cmp[i]= zero_cmp;
2880 break;
2881 case FF_CMP_NSSE:
2882 cmp[i]= c->nsse[i];
2883 break;
2884 case FF_CMP_W53:
2885 cmp[i]= c->w53[i];
2886 break;
2887 case FF_CMP_W97:
2888 cmp[i]= c->w97[i];
2889 break;
2890 default:
2891 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
2892 }
2893 }
2894 }
2895
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 * Zeroes six consecutive 64-coefficient DCT blocks in one call.
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
2903
/**
 * Byte-wise accumulate: dst[i] += src[i] for i in [0, w),
 * with the usual uint8_t wraparound (mod 256).
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    for(i = 0; i < w; i++)
        dst[i] += src[i];
}
2919
/**
 * Byte-wise difference: dst[i] = src1[i] - src2[i] for i in [0, w),
 * with the usual uint8_t wraparound (mod 256).
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    for(i = 0; i < w; i++)
        dst[i] = src1[i] - src2[i];
}
2935
/**
 * HuffYUV median-prediction subtraction: for each position, predicts from the
 * running left value, the top value src1[i], and their mid_pred combination
 * (l + top - topleft, masked to a byte), then stores src2[i] minus that
 * prediction. *left and *left_top carry predictor state across calls.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t prev    = *left;
    uint8_t topleft = *left_top;
    int i;

    for(i=0; i<w; i++){
        const int top  = src1[i];
        const int pred = mid_pred(prev, top, (prev + top - topleft) & 0xFF);
        topleft = top;
        prev    = src2[i];
        dst[i]  = prev - pred;
    }

    *left     = prev;
    *left_top = topleft;
}
2953
/* Sum/difference butterfly into separate outputs: o1 = i1+i2, o2 = i1-i2.
   NOTE: i1 and i2 are each evaluated twice — pass side-effect-free expressions. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* In-place butterfly: x,y := x+y, x-y (operands must be int lvalues). */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Final butterfly stage folded into a magnitude sum: |x+y| + |x-y|. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2968
2969 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2970 int i;
2971 int temp[64];
2972 int sum=0;
2973
2974 assert(h==8);
2975
2976 for(i=0; i<8; i++){
2977 //FIXME try pointer walks
2978 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2979 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2980 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2981 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2982
2983 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2984 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2985 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2986 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2987
2988 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2989 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2990 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2991 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2992 }
2993
2994 for(i=0; i<8; i++){
2995 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2996 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2997 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2998 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2999
3000 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3001 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3002 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3003 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3004
3005 sum +=
3006 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3007 +BUTTERFLYA(temp[8*1+i], temp[8*5+i