688f0a9618fd2d37ca4fd8b377b64892a8dbc151
[libav.git] / libavcodec / dsputil.c
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21 */
22
23 /**
24 * @file dsputil.c
25 * DSP utils
26 */
27
28 #include "avcodec.h"
29 #include "dsputil.h"
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
32 #include "faandct.h"
33 #include "snow.h"
34
35 /* snow.c */
36 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
37
/* Zero-initialized here; presumably filled in at init time (e.g. by
   dsputil_init(), which is not visible in this chunk -- confirm).
   cropTbl clamps signed indices to 0..255, squareTbl caches i*i. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t squareTbl[512] = {0, };
40
/* Classic zigzag scan order: entry i is the raster index (row*8 + column)
   of the i-th coefficient visited. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
51
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields (even/odd rows alternate). */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
64
/* not permutated inverse zigzag_direct + 1 for MMX quantizer
   (zero-initialized here; presumably computed at init time -- confirm) */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
67
/* Alternate horizontal scan order (raster indices, as in ff_zigzag_direct).
   NOTE(review): presumably the MPEG alternate-horizontal scan for
   interlaced material -- confirm against the relevant spec. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
78
/* Alternate vertical scan order (raster indices, as in ff_zigzag_direct).
   NOTE(review): presumably the MPEG-2 "alternate scan" for interlaced
   material -- confirm against the relevant spec. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
89
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
   i.e. inverse[b] is ceil(2^32 / b), allowing division by a small
   constant to be replaced by a multiply and shift. Entries 0 and 1
   are placeholders (division by 0/1 is not performed this way). */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
125
/* Input permutation for the simple_idct_mmx: entry i is the position
   (6-bit raster index) where coefficient i must be stored so that the
   MMX idct reads it in the order it expects. */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
137
/**
 * Sum of all 256 pixel values of a 16x16 block.
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return the sum of the 256 pixels
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
159
/**
 * Sum of squares of all 256 pixels of a 16x16 block.
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return sum over all pixels of pix[x]*pix[x] (via the squareTbl lookup)
 */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = squareTbl + 256; /* sq[i] presumably holds i*i for -256..255 -- confirm at init */

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            /* straightforward per-byte version, kept disabled for reference */
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            /* 64-bit path: one 8-byte load, then extract each byte.
               NOTE(review): the pointer cast performs a possibly unaligned
               load and violates strict aliasing; tolerated on the targets
               this file supports, but not portable C. */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* 32-bit path: two 4-byte loads (same aliasing caveat as above) */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16; /* advance to the start of the next row */
    }
    return s;
}
207
/**
 * Byte-swap w 32-bit words from src into dst.
 * @param dst destination word array (may equal src)
 * @param src source word array
 * @param w   number of 32-bit words to swap
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i = 0;

    /* main loop: eight words at a time */
    while (i + 8 <= w) {
        int k;
        for (k = 0; k < 8; k++)
            dst[i + k] = bswap_32(src[i + k]);
        i += 8;
    }
    /* remaining 0..7 words */
    while (i < w) {
        dst[i] = bswap_32(src[i]);
        i++;
    }
}
225
226 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
227 {
228 int s, i;
229 uint32_t *sq = squareTbl + 256;
230
231 s = 0;
232 for (i = 0; i < h; i++) {
233 s += sq[pix1[0] - pix2[0]];
234 s += sq[pix1[1] - pix2[1]];
235 s += sq[pix1[2] - pix2[2]];
236 s += sq[pix1[3] - pix2[3]];
237 pix1 += line_size;
238 pix2 += line_size;
239 }
240 return s;
241 }
242
243 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
244 {
245 int s, i;
246 uint32_t *sq = squareTbl + 256;
247
248 s = 0;
249 for (i = 0; i < h; i++) {
250 s += sq[pix1[0] - pix2[0]];
251 s += sq[pix1[1] - pix2[1]];
252 s += sq[pix1[2] - pix2[2]];
253 s += sq[pix1[3] - pix2[3]];
254 s += sq[pix1[4] - pix2[4]];
255 s += sq[pix1[5] - pix2[5]];
256 s += sq[pix1[6] - pix2[6]];
257 s += sq[pix1[7] - pix2[7]];
258 pix1 += line_size;
259 pix2 += line_size;
260 }
261 return s;
262 }
263
264 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
265 {
266 int s, i;
267 uint32_t *sq = squareTbl + 256;
268
269 s = 0;
270 for (i = 0; i < h; i++) {
271 s += sq[pix1[ 0] - pix2[ 0]];
272 s += sq[pix1[ 1] - pix2[ 1]];
273 s += sq[pix1[ 2] - pix2[ 2]];
274 s += sq[pix1[ 3] - pix2[ 3]];
275 s += sq[pix1[ 4] - pix2[ 4]];
276 s += sq[pix1[ 5] - pix2[ 5]];
277 s += sq[pix1[ 6] - pix2[ 6]];
278 s += sq[pix1[ 7] - pix2[ 7]];
279 s += sq[pix1[ 8] - pix2[ 8]];
280 s += sq[pix1[ 9] - pix2[ 9]];
281 s += sq[pix1[10] - pix2[10]];
282 s += sq[pix1[11] - pix2[11]];
283 s += sq[pix1[12] - pix2[12]];
284 s += sq[pix1[13] - pix2[13]];
285 s += sq[pix1[14] - pix2[14]];
286 s += sq[pix1[15] - pix2[15]];
287
288 pix1 += line_size;
289 pix2 += line_size;
290 }
291 return s;
292 }
293
294
/**
 * Wavelet-domain distortion score between two pixel blocks.
 *
 * The pixel difference (scaled by 16) is transformed with ff_spatial_dwt()
 * and the unweighted sum of absolute coefficients is returned, divided by 4.
 *
 * @param v         unused context pointer (kept for the cmp-function signature)
 * @param pix1      first block of pixels
 * @param pix2      second block of pixels
 * @param line_size byte stride of both blocks
 * @param w         block width, 8 or 16 (selects 3 or 4 decomposition levels)
 * @param h         block height (tmp[] holds at most a 16x16 plane)
 * @param type      wavelet type forwarded to ff_spatial_dwt()
 *                  (1 from the w53 wrappers, 0 from the w97 ones)
 * @return sum of absolute transform coefficients >> 2, or 0 when the
 *         snow encoder is not compiled in
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
#ifdef CONFIG_SNOW_ENCODER //idwt is in snow.c
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[16*16];

    /* signed difference, scaled by 16 to keep precision through the DWT */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++)
            tmp[16*i + j] = (pix1[j] - pix2[j])<<4;
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 16, type, dec_count);

    /* unweighted sum of absolute coefficients (a per-subband weighted
       variant was previously kept here under #if 0 and has been removed) */
    s = 0;
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++)
            s += ABS(tmp[16*i + j]);
    }
    assert(s>=0);

    return s>>2;
#else
    /* Bug fix: the original body was entirely preprocessed away without
       CONFIG_SNOW_ENCODER, so a non-void function fell off its end --
       undefined behavior if it was ever called. Return a neutral score. */
    return 0;
#endif
}
377
/** 5/3 wavelet score of an 8-pixel-wide block (type 1). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int score = w_c(v, pix1, pix2, line_size, 8, h, 1);
    return score;
}
381
/** 9/7 wavelet score of an 8-pixel-wide block (type 0). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int score = w_c(v, pix1, pix2, line_size, 8, h, 0);
    return score;
}
385
/** 5/3 wavelet score of a 16-pixel-wide block (type 1). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int score = w_c(v, pix1, pix2, line_size, 16, h, 1);
    return score;
}
389
/** 9/7 wavelet score of a 16-pixel-wide block (type 0). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int score = w_c(v, pix1, pix2, line_size, 16, h, 0);
    return score;
}
393
394 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
395 {
396 int i;
397
398 /* read the pixels */
399 for(i=0;i<8;i++) {
400 block[0] = pixels[0];
401 block[1] = pixels[1];
402 block[2] = pixels[2];
403 block[3] = pixels[3];
404 block[4] = pixels[4];
405 block[5] = pixels[5];
406 block[6] = pixels[6];
407 block[7] = pixels[7];
408 pixels += line_size;
409 block += 8;
410 }
411 }
412
413 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
414 const uint8_t *s2, int stride){
415 int i;
416
417 /* read the pixels */
418 for(i=0;i<8;i++) {
419 block[0] = s1[0] - s2[0];
420 block[1] = s1[1] - s2[1];
421 block[2] = s1[2] - s2[2];
422 block[3] = s1[3] - s2[3];
423 block[4] = s1[4] - s2[4];
424 block[5] = s1[5] - s2[5];
425 block[6] = s1[6] - s2[6];
426 block[7] = s1[7] - s2[7];
427 s1 += stride;
428 s2 += stride;
429 block += 8;
430 }
431 }
432
433
434 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
435 int line_size)
436 {
437 int i;
438 uint8_t *cm = cropTbl + MAX_NEG_CROP;
439
440 /* read the pixels */
441 for(i=0;i<8;i++) {
442 pixels[0] = cm[block[0]];
443 pixels[1] = cm[block[1]];
444 pixels[2] = cm[block[2]];
445 pixels[3] = cm[block[3]];
446 pixels[4] = cm[block[4]];
447 pixels[5] = cm[block[5]];
448 pixels[6] = cm[block[6]];
449 pixels[7] = cm[block[7]];
450
451 pixels += line_size;
452 block += 8;
453 }
454 }
455
456 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
457 int line_size)
458 {
459 int i;
460 uint8_t *cm = cropTbl + MAX_NEG_CROP;
461
462 /* read the pixels */
463 for(i=0;i<4;i++) {
464 pixels[0] = cm[block[0]];
465 pixels[1] = cm[block[1]];
466 pixels[2] = cm[block[2]];
467 pixels[3] = cm[block[3]];
468
469 pixels += line_size;
470 block += 8;
471 }
472 }
473
474 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
475 int line_size)
476 {
477 int i;
478 uint8_t *cm = cropTbl + MAX_NEG_CROP;
479
480 /* read the pixels */
481 for(i=0;i<2;i++) {
482 pixels[0] = cm[block[0]];
483 pixels[1] = cm[block[1]];
484
485 pixels += line_size;
486 block += 8;
487 }
488 }
489
490 static void put_signed_pixels_clamped_c(const DCTELEM *block,
491 uint8_t *restrict pixels,
492 int line_size)
493 {
494 int i, j;
495
496 for (i = 0; i < 8; i++) {
497 for (j = 0; j < 8; j++) {
498 if (*block < -128)
499 *pixels = 0;
500 else if (*block > 127)
501 *pixels = 255;
502 else
503 *pixels = (uint8_t)(*block + 128);
504 block++;
505 pixels++;
506 }
507 pixels += (line_size - 8);
508 }
509 }
510
511 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
512 int line_size)
513 {
514 int i;
515 uint8_t *cm = cropTbl + MAX_NEG_CROP;
516
517 /* read the pixels */
518 for(i=0;i<8;i++) {
519 pixels[0] = cm[pixels[0] + block[0]];
520 pixels[1] = cm[pixels[1] + block[1]];
521 pixels[2] = cm[pixels[2] + block[2]];
522 pixels[3] = cm[pixels[3] + block[3]];
523 pixels[4] = cm[pixels[4] + block[4]];
524 pixels[5] = cm[pixels[5] + block[5]];
525 pixels[6] = cm[pixels[6] + block[6]];
526 pixels[7] = cm[pixels[7] + block[7]];
527 pixels += line_size;
528 block += 8;
529 }
530 }
531
532 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
533 int line_size)
534 {
535 int i;
536 uint8_t *cm = cropTbl + MAX_NEG_CROP;
537
538 /* read the pixels */
539 for(i=0;i<4;i++) {
540 pixels[0] = cm[pixels[0] + block[0]];
541 pixels[1] = cm[pixels[1] + block[1]];
542 pixels[2] = cm[pixels[2] + block[2]];
543 pixels[3] = cm[pixels[3] + block[3]];
544 pixels += line_size;
545 block += 8;
546 }
547 }
548
549 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
550 int line_size)
551 {
552 int i;
553 uint8_t *cm = cropTbl + MAX_NEG_CROP;
554
555 /* read the pixels */
556 for(i=0;i<2;i++) {
557 pixels[0] = cm[pixels[0] + block[0]];
558 pixels[1] = cm[pixels[1] + block[1]];
559 pixels += line_size;
560 block += 8;
561 }
562 }
563
564 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
565 {
566 int i;
567 for(i=0;i<8;i++) {
568 pixels[0] += block[0];
569 pixels[1] += block[1];
570 pixels[2] += block[2];
571 pixels[3] += block[3];
572 pixels[4] += block[4];
573 pixels[5] += block[5];
574 pixels[6] += block[6];
575 pixels[7] += block[7];
576 pixels += line_size;
577 block += 8;
578 }
579 }
580
581 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
582 {
583 int i;
584 for(i=0;i<4;i++) {
585 pixels[0] += block[0];
586 pixels[1] += block[1];
587 pixels[2] += block[2];
588 pixels[3] += block[3];
589 pixels += line_size;
590 block += 4;
591 }
592 }
593
594 #if 0
595
596 #define PIXOP2(OPNAME, OP) \
597 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
598 {\
599 int i;\
600 for(i=0; i<h; i++){\
601 OP(*((uint64_t*)block), LD64(pixels));\
602 pixels+=line_size;\
603 block +=line_size;\
604 }\
605 }\
606 \
607 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
608 {\
609 int i;\
610 for(i=0; i<h; i++){\
611 const uint64_t a= LD64(pixels );\
612 const uint64_t b= LD64(pixels+1);\
613 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
614 pixels+=line_size;\
615 block +=line_size;\
616 }\
617 }\
618 \
619 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
620 {\
621 int i;\
622 for(i=0; i<h; i++){\
623 const uint64_t a= LD64(pixels );\
624 const uint64_t b= LD64(pixels+1);\
625 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
626 pixels+=line_size;\
627 block +=line_size;\
628 }\
629 }\
630 \
631 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
632 {\
633 int i;\
634 for(i=0; i<h; i++){\
635 const uint64_t a= LD64(pixels );\
636 const uint64_t b= LD64(pixels+line_size);\
637 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
638 pixels+=line_size;\
639 block +=line_size;\
640 }\
641 }\
642 \
643 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
644 {\
645 int i;\
646 for(i=0; i<h; i++){\
647 const uint64_t a= LD64(pixels );\
648 const uint64_t b= LD64(pixels+line_size);\
649 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
650 pixels+=line_size;\
651 block +=line_size;\
652 }\
653 }\
654 \
655 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
656 {\
657 int i;\
658 const uint64_t a= LD64(pixels );\
659 const uint64_t b= LD64(pixels+1);\
660 uint64_t l0= (a&0x0303030303030303ULL)\
661 + (b&0x0303030303030303ULL)\
662 + 0x0202020202020202ULL;\
663 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
664 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
665 uint64_t l1,h1;\
666 \
667 pixels+=line_size;\
668 for(i=0; i<h; i+=2){\
669 uint64_t a= LD64(pixels );\
670 uint64_t b= LD64(pixels+1);\
671 l1= (a&0x0303030303030303ULL)\
672 + (b&0x0303030303030303ULL);\
673 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
674 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
675 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
676 pixels+=line_size;\
677 block +=line_size;\
678 a= LD64(pixels );\
679 b= LD64(pixels+1);\
680 l0= (a&0x0303030303030303ULL)\
681 + (b&0x0303030303030303ULL)\
682 + 0x0202020202020202ULL;\
683 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
684 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
685 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
686 pixels+=line_size;\
687 block +=line_size;\
688 }\
689 }\
690 \
691 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
692 {\
693 int i;\
694 const uint64_t a= LD64(pixels );\
695 const uint64_t b= LD64(pixels+1);\
696 uint64_t l0= (a&0x0303030303030303ULL)\
697 + (b&0x0303030303030303ULL)\
698 + 0x0101010101010101ULL;\
699 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
700 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
701 uint64_t l1,h1;\
702 \
703 pixels+=line_size;\
704 for(i=0; i<h; i+=2){\
705 uint64_t a= LD64(pixels );\
706 uint64_t b= LD64(pixels+1);\
707 l1= (a&0x0303030303030303ULL)\
708 + (b&0x0303030303030303ULL);\
709 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
710 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
711 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
712 pixels+=line_size;\
713 block +=line_size;\
714 a= LD64(pixels );\
715 b= LD64(pixels+1);\
716 l0= (a&0x0303030303030303ULL)\
717 + (b&0x0303030303030303ULL)\
718 + 0x0101010101010101ULL;\
719 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
720 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
721 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
722 pixels+=line_size;\
723 block +=line_size;\
724 }\
725 }\
726 \
727 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
728 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
729 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
730 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
731 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
732 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
733 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
734
735 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
736 #else // 64 bit variant
737
738 #define PIXOP2(OPNAME, OP) \
739 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
740 int i;\
741 for(i=0; i<h; i++){\
742 OP(*((uint16_t*)(block )), LD16(pixels ));\
743 pixels+=line_size;\
744 block +=line_size;\
745 }\
746 }\
747 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
748 int i;\
749 for(i=0; i<h; i++){\
750 OP(*((uint32_t*)(block )), LD32(pixels ));\
751 pixels+=line_size;\
752 block +=line_size;\
753 }\
754 }\
755 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
756 int i;\
757 for(i=0; i<h; i++){\
758 OP(*((uint32_t*)(block )), LD32(pixels ));\
759 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
760 pixels+=line_size;\
761 block +=line_size;\
762 }\
763 }\
764 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
765 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
766 }\
767 \
768 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
769 int src_stride1, int src_stride2, int h){\
770 int i;\
771 for(i=0; i<h; i++){\
772 uint32_t a,b;\
773 a= LD32(&src1[i*src_stride1 ]);\
774 b= LD32(&src2[i*src_stride2 ]);\
775 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
776 a= LD32(&src1[i*src_stride1+4]);\
777 b= LD32(&src2[i*src_stride2+4]);\
778 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
779 }\
780 }\
781 \
782 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
783 int src_stride1, int src_stride2, int h){\
784 int i;\
785 for(i=0; i<h; i++){\
786 uint32_t a,b;\
787 a= LD32(&src1[i*src_stride1 ]);\
788 b= LD32(&src2[i*src_stride2 ]);\
789 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
790 a= LD32(&src1[i*src_stride1+4]);\
791 b= LD32(&src2[i*src_stride2+4]);\
792 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
793 }\
794 }\
795 \
796 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
797 int src_stride1, int src_stride2, int h){\
798 int i;\
799 for(i=0; i<h; i++){\
800 uint32_t a,b;\
801 a= LD32(&src1[i*src_stride1 ]);\
802 b= LD32(&src2[i*src_stride2 ]);\
803 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
804 }\
805 }\
806 \
807 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
808 int src_stride1, int src_stride2, int h){\
809 int i;\
810 for(i=0; i<h; i++){\
811 uint32_t a,b;\
812 a= LD16(&src1[i*src_stride1 ]);\
813 b= LD16(&src2[i*src_stride2 ]);\
814 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
815 }\
816 }\
817 \
818 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
819 int src_stride1, int src_stride2, int h){\
820 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
821 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
822 }\
823 \
824 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
825 int src_stride1, int src_stride2, int h){\
826 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
827 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
828 }\
829 \
830 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
831 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
832 }\
833 \
834 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
835 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
836 }\
837 \
838 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
839 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
840 }\
841 \
842 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
843 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
844 }\
845 \
846 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
847 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
848 int i;\
849 for(i=0; i<h; i++){\
850 uint32_t a, b, c, d, l0, l1, h0, h1;\
851 a= LD32(&src1[i*src_stride1]);\
852 b= LD32(&src2[i*src_stride2]);\
853 c= LD32(&src3[i*src_stride3]);\
854 d= LD32(&src4[i*src_stride4]);\
855 l0= (a&0x03030303UL)\
856 + (b&0x03030303UL)\
857 + 0x02020202UL;\
858 h0= ((a&0xFCFCFCFCUL)>>2)\
859 + ((b&0xFCFCFCFCUL)>>2);\
860 l1= (c&0x03030303UL)\
861 + (d&0x03030303UL);\
862 h1= ((c&0xFCFCFCFCUL)>>2)\
863 + ((d&0xFCFCFCFCUL)>>2);\
864 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
865 a= LD32(&src1[i*src_stride1+4]);\
866 b= LD32(&src2[i*src_stride2+4]);\
867 c= LD32(&src3[i*src_stride3+4]);\
868 d= LD32(&src4[i*src_stride4+4]);\
869 l0= (a&0x03030303UL)\
870 + (b&0x03030303UL)\
871 + 0x02020202UL;\
872 h0= ((a&0xFCFCFCFCUL)>>2)\
873 + ((b&0xFCFCFCFCUL)>>2);\
874 l1= (c&0x03030303UL)\
875 + (d&0x03030303UL);\
876 h1= ((c&0xFCFCFCFCUL)>>2)\
877 + ((d&0xFCFCFCFCUL)>>2);\
878 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
879 }\
880 }\
881 \
882 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
883 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
884 }\
885 \
886 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
887 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
888 }\
889 \
890 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
891 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
892 }\
893 \
894 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
895 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
896 }\
897 \
898 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
899 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
900 int i;\
901 for(i=0; i<h; i++){\
902 uint32_t a, b, c, d, l0, l1, h0, h1;\
903 a= LD32(&src1[i*src_stride1]);\
904 b= LD32(&src2[i*src_stride2]);\
905 c= LD32(&src3[i*src_stride3]);\
906 d= LD32(&src4[i*src_stride4]);\
907 l0= (a&0x03030303UL)\
908 + (b&0x03030303UL)\
909 + 0x01010101UL;\
910 h0= ((a&0xFCFCFCFCUL)>>2)\
911 + ((b&0xFCFCFCFCUL)>>2);\
912 l1= (c&0x03030303UL)\
913 + (d&0x03030303UL);\
914 h1= ((c&0xFCFCFCFCUL)>>2)\
915 + ((d&0xFCFCFCFCUL)>>2);\
916 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
917 a= LD32(&src1[i*src_stride1+4]);\
918 b= LD32(&src2[i*src_stride2+4]);\
919 c= LD32(&src3[i*src_stride3+4]);\
920 d= LD32(&src4[i*src_stride4+4]);\
921 l0= (a&0x03030303UL)\
922 + (b&0x03030303UL)\
923 + 0x01010101UL;\
924 h0= ((a&0xFCFCFCFCUL)>>2)\
925 + ((b&0xFCFCFCFCUL)>>2);\
926 l1= (c&0x03030303UL)\
927 + (d&0x03030303UL);\
928 h1= ((c&0xFCFCFCFCUL)>>2)\
929 + ((d&0xFCFCFCFCUL)>>2);\
930 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
931 }\
932 }\
933 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
934 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
935 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
936 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
937 }\
938 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
939 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
940 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
941 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
942 }\
943 \
944 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
945 {\
946 int i, a0, b0, a1, b1;\
947 a0= pixels[0];\
948 b0= pixels[1] + 2;\
949 a0 += b0;\
950 b0 += pixels[2];\
951 \
952 pixels+=line_size;\
953 for(i=0; i<h; i+=2){\
954 a1= pixels[0];\
955 b1= pixels[1];\
956 a1 += b1;\
957 b1 += pixels[2];\
958 \
959 block[0]= (a1+a0)>>2; /* FIXME non put */\
960 block[1]= (b1+b0)>>2;\
961 \
962 pixels+=line_size;\
963 block +=line_size;\
964 \
965 a0= pixels[0];\
966 b0= pixels[1] + 2;\
967 a0 += b0;\
968 b0 += pixels[2];\
969 \
970 block[0]= (a1+a0)>>2;\
971 block[1]= (b1+b0)>>2;\
972 pixels+=line_size;\
973 block +=line_size;\
974 }\
975 }\
976 \
977 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
978 {\
979 int i;\
980 const uint32_t a= LD32(pixels );\
981 const uint32_t b= LD32(pixels+1);\
982 uint32_t l0= (a&0x03030303UL)\
983 + (b&0x03030303UL)\
984 + 0x02020202UL;\
985 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
986 + ((b&0xFCFCFCFCUL)>>2);\
987 uint32_t l1,h1;\
988 \
989 pixels+=line_size;\
990 for(i=0; i<h; i+=2){\
991 uint32_t a= LD32(pixels );\
992 uint32_t b= LD32(pixels+1);\
993 l1= (a&0x03030303UL)\
994 + (b&0x03030303UL);\
995 h1= ((a&0xFCFCFCFCUL)>>2)\
996 + ((b&0xFCFCFCFCUL)>>2);\
997 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
998 pixels+=line_size;\
999 block +=line_size;\
1000 a= LD32(pixels );\
1001 b= LD32(pixels+1);\
1002 l0= (a&0x03030303UL)\
1003 + (b&0x03030303UL)\
1004 + 0x02020202UL;\
1005 h0= ((a&0xFCFCFCFCUL)>>2)\
1006 + ((b&0xFCFCFCFCUL)>>2);\
1007 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1008 pixels+=line_size;\
1009 block +=line_size;\
1010 }\
1011 }\
1012 \
1013 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1014 {\
1015 int j;\
1016 for(j=0; j<2; j++){\
1017 int i;\
1018 const uint32_t a= LD32(pixels );\
1019 const uint32_t b= LD32(pixels+1);\
1020 uint32_t l0= (a&0x03030303UL)\
1021 + (b&0x03030303UL)\
1022 + 0x02020202UL;\
1023 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1024 + ((b&0xFCFCFCFCUL)>>2);\
1025 uint32_t l1,h1;\
1026 \
1027 pixels+=line_size;\
1028 for(i=0; i<h; i+=2){\
1029 uint32_t a= LD32(pixels );\
1030 uint32_t b= LD32(pixels+1);\
1031 l1= (a&0x03030303UL)\
1032 + (b&0x03030303UL);\
1033 h1= ((a&0xFCFCFCFCUL)>>2)\
1034 + ((b&0xFCFCFCFCUL)>>2);\
1035 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1036 pixels+=line_size;\
1037 block +=line_size;\
1038 a= LD32(pixels );\
1039 b= LD32(pixels+1);\
1040 l0= (a&0x03030303UL)\
1041 + (b&0x03030303UL)\
1042 + 0x02020202UL;\
1043 h0= ((a&0xFCFCFCFCUL)>>2)\
1044 + ((b&0xFCFCFCFCUL)>>2);\
1045 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1046 pixels+=line_size;\
1047 block +=line_size;\
1048 }\
1049 pixels+=4-line_size*(h+1);\
1050 block +=4-line_size*h;\
1051 }\
1052 }\
1053 \
1054 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1055 {\
1056 int j;\
1057 for(j=0; j<2; j++){\
1058 int i;\
1059 const uint32_t a= LD32(pixels );\
1060 const uint32_t b= LD32(pixels+1);\
1061 uint32_t l0= (a&0x03030303UL)\
1062 + (b&0x03030303UL)\
1063 + 0x01010101UL;\
1064 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1065 + ((b&0xFCFCFCFCUL)>>2);\
1066 uint32_t l1,h1;\
1067 \
1068 pixels+=line_size;\
1069 for(i=0; i<h; i+=2){\
1070 uint32_t a= LD32(pixels );\
1071 uint32_t b= LD32(pixels+1);\
1072 l1= (a&0x03030303UL)\
1073 + (b&0x03030303UL);\
1074 h1= ((a&0xFCFCFCFCUL)>>2)\
1075 + ((b&0xFCFCFCFCUL)>>2);\
1076 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1077 pixels+=line_size;\
1078 block +=line_size;\
1079 a= LD32(pixels );\
1080 b= LD32(pixels+1);\
1081 l0= (a&0x03030303UL)\
1082 + (b&0x03030303UL)\
1083 + 0x01010101UL;\
1084 h0= ((a&0xFCFCFCFCUL)>>2)\
1085 + ((b&0xFCFCFCFCUL)>>2);\
1086 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1087 pixels+=line_size;\
1088 block +=line_size;\
1089 }\
1090 pixels+=4-line_size*(h+1);\
1091 block +=4-line_size*h;\
1092 }\
1093 }\
1094 \
1095 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1096 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1097 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1098 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1099 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1100 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1101 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1102 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1103
/* Per-pixel store operators consumed by PIXOP2: "avg" combines the new value
   with the existing destination via rnd_avg32, "put" simply overwrites. */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

/* Instantiate the full put_*/avg_* pixel-copy/halfpel function family. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* Scalar rounding averages of 2 and 4 values, used by the helpers below. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1115
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    /* No-rounding average of two 16-wide sources into dst; thin wrapper that
       uses one common stride for dst and both sources. */
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1119
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    /* No-rounding average of two 8-wide sources into dst; thin wrapper that
       uses one common stride for dst and both sources. */
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1123
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* One-warp-point GMC: 8-pixel-wide bilinear interpolation with 1/16-pel
       fractional position (x16, y16).  The four corner weights sum to 256,
       so the weighted sum plus rounder is normalized with >> 8. */
    const int wtl = (16 - x16) * (16 - y16);  /* top-left weight */
    const int wtr = (     x16) * (16 - y16);  /* top-right weight */
    const int wbl = (16 - x16) * (     y16);  /* bottom-left weight */
    const int wbr = (     x16) * (     y16);  /* bottom-right weight */
    int row;

    for (row = 0; row < h; row++) {
        int col;
        for (col = 0; col < 8; col++) {
            dst[col] = (wtl * src[col]          + wtr * src[col + 1] +
                        wbl * src[stride + col] + wbr * src[stride + col + 1] +
                        rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1146
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    /* Affine global motion compensation for an 8-wide block of h rows.
       (ox,oy) is the 16.16 fixed-point source position of the first pixel;
       (dxx,dyx) is the per-column increment and (dxy,dyy) the per-row
       increment.  Samples outside [0,width)x[0,height) are clamped to the
       nearest edge; r is the rounding bias for the bilinear blend.
       NOTE(review): frac_x/frac_y are extracted from vx>>16 with &(s-1),
       which presumes shift <= 16 — confirm against callers. */
    int y, vx, vy;
    const int s= 1<<shift;  /* sub-pel positions per integer pixel */

    /* after this, width/height are the last valid column/row index */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: bilinear blend of the 4 neighbours */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    /* outside vertically: clamp the row, interpolate in x only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* outside horizontally: clamp the column, interpolate in y only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* outside both ways: nearest clamped pixel, no blend */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1204
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, integer (0,0) phase: plain block copy, dispatched to the
       fixed-width copy routine.  Widths other than 2/4/8/16 are silently
       ignored (no default case). */
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
1213
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, horizontal 1/3 phase: dst = round((2*left + right)/3),
       computed in fixed point as (683*(2*l + r + 1)) >> 11 (683*3 == 2049). */
    int rows = height;
    while (rows-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (683*(2*src[x] + src[x+1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1224
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, horizontal 2/3 phase: dst = round((left + 2*right)/3) in
       fixed point via the factor 683/2048. */
    int rows = height;
    while (rows-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (683*(src[x] + 2*src[x+1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1235
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, vertical 1/3 phase: dst = round((2*top + bottom)/3) in
       fixed point via the factor 683/2048. */
    int rows = height;
    while (rows-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (683*(2*src[x] + src[x+stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1246
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, (1/3, 1/3) phase: bilinear corner weights 4,3,3,2 (sum 12),
       normalized in fixed point with 2731/32768 (2731*12 == 32772). */
    int rows = height;
    while (rows-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (2731*(4*src[x] + 3*src[x+1] + 3*src[x+stride] + 2*src[x+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1257
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, (1/3, 2/3) phase: bilinear corner weights 3,2,4,3 (sum 12),
       normalized with 2731/32768. */
    int rows = height;
    while (rows-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (2731*(3*src[x] + 2*src[x+1] + 4*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1268
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, vertical 2/3 phase: dst = round((top + 2*bottom)/3) in
       fixed point via the factor 683/2048. */
    int rows = height;
    while (rows-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (683*(src[x] + 2*src[x+stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1279
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, (2/3, 1/3) phase: bilinear corner weights 3,4,2,3 (sum 12),
       normalized with 2731/32768. */
    int rows = height;
    while (rows-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (2731*(3*src[x] + 4*src[x+1] + 2*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1290
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, (2/3, 2/3) phase: bilinear corner weights 2,3,3,4 (sum 12),
       normalized with 2731/32768. */
    int rows = height;
    while (rows-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (2731*(2*src[x] + 3*src[x+1] + 3*src[x+stride] + 4*src[x+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1301
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, integer (0,0) phase with destination averaging, dispatched
       to the fixed-width averaging routine.  Widths other than 2/4/8/16 are
       silently ignored (no default case). */
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
1310
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, horizontal 1/3 phase, rounding-averaged with the existing
       destination pixel. */
    int rows = height;
    while (rows-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683*(2*src[x] + src[x+1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1321
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, horizontal 2/3 phase, rounding-averaged with the existing
       destination pixel. */
    int rows = height;
    while (rows-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683*(src[x] + 2*src[x+1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1332
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, vertical 1/3 phase, rounding-averaged with the existing
       destination pixel. */
    int rows = height;
    while (rows-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683*(2*src[x] + src[x+stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1343
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, (1/3, 1/3) phase (weights 4,3,3,2), rounding-averaged with
       the existing destination pixel. */
    int rows = height;
    while (rows-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731*(4*src[x] + 3*src[x+1] + 3*src[x+stride] + 2*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1354
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, (1/3, 2/3) phase (weights 3,2,4,3), rounding-averaged with
       the existing destination pixel. */
    int rows = height;
    while (rows-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731*(3*src[x] + 2*src[x+1] + 4*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1365
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, vertical 2/3 phase, rounding-averaged with the existing
       destination pixel. */
    int rows = height;
    while (rows-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683*(src[x] + 2*src[x+stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1376
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, (2/3, 1/3) phase (weights 3,4,2,3), rounding-averaged with
       the existing destination pixel. */
    int rows = height;
    while (rows-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731*(3*src[x] + 4*src[x+1] + 2*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1387
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, (2/3, 2/3) phase (weights 2,3,3,4), rounding-averaged with
       the existing destination pixel. */
    int rows = height;
    while (rows-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731*(2*src[x] + 3*src[x+1] + 3*src[x+stride] + 4*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1398 #if 0
1399 #define TPEL_WIDTH(width)\
1400 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1401 void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1402 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1403 void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1404 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1405 void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1406 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1407 void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1408 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1409 void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1410 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1411 void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1412 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1413 void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1414 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1415 void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1416 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1417 void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1418 #endif
1419
/*
 * H264_CHROMA_MC(OPNAME, OP) — generates the 2-, 4- and 8-wide H.264 chroma
 * motion-compensation functions.  Each output pixel is the bilinear blend
 * A*p00 + B*p01 + C*p10 + D*p11 of the four neighbouring source pixels, with
 * eighth-pel corner weights A..D derived from (x, y) in [0,8); the weights
 * sum to 64.  OP performs the final normalization and store (op_put/op_avg).
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1482
/* H.264 chroma store operators: the 6-bit weighted sum is normalized with
   (+32) >> 6; "avg" additionally rounding-averages with the destination. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1490
static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 2-byte-wide block of h rows, one 16-bit load/store per row
       (LD16/ST16 are the project's 16-bit access helpers). */
    while (h-- > 0) {
        ST16(dst, LD16(src));
        dst += dstStride;
        src += srcStride;
    }
}
1501
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 4-byte-wide block of h rows, one 32-bit load/store per row
       (LD32/ST32 are the project's 32-bit access helpers). */
    while (h-- > 0) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1512
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy an 8-byte-wide block of h rows as two 32-bit load/store pairs
       per row. */
    while (h-- > 0) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
1524
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 16-byte-wide block of h rows as four 32-bit load/store pairs
       per row. */
    while (h-- > 0) {
        ST32(dst     , LD32(src     ));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}
1538
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 17-byte-wide block (16+1, e.g. a qpel source row) of h rows:
       four 32-bit pairs plus the trailing odd byte. */
    while (h-- > 0) {
        ST32(dst     , LD32(src     ));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1553
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 9-byte-wide block (8+1, e.g. a qpel source row) of h rows:
       two 32-bit pairs plus the trailing odd byte. */
    while (h-- > 0) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1566
1567
1568 #define QPEL_MC(r, OPNAME, RND, OP) \
1569 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1570 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1571 int i;\
1572 for(i=0; i<h; i++)\
1573 {\
1574 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1575 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1576 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1577 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1578 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1579 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1580 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1581 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1582 dst+=dstStride;\
1583 src+=srcStride;\
1584 }\
1585 }\
1586 \
1587 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1588 const int w=8;\
1589 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1590 int i;\
1591 for(i=0; i<w; i++)\
1592 {\
1593 const int src0= src[0*srcStride];\
1594 const int src1= src[1*srcStride];\
1595 const int src2= src[2*srcStride];\
1596 const int src3= src[3*srcStride];\
1597 const int src4= src[4*srcStride];\
1598 const int src5= src[5*srcStride];\
1599 const int src6= src[6*srcStride];\
1600 const int src7= src[7*srcStride];\
1601 const int src8= src[8*srcStride];\
1602 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1603 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1604 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1605 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1606 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1607 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1608 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1609 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1610 dst++;\
1611 src++;\
1612 }\
1613 }\
1614 \
1615 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1616 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1617 int i;\
1618 \
1619 for(i=0; i<h; i++)\
1620 {\
1621 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1622 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1623 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1624 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1625 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1626 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1627 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1628 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1629 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1630 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1631 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1632 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1633 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1634 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1635 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1636 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1637 dst+=dstStride;\
1638 src+=srcStride;\
1639 }\
1640 }\
1641 \
1642 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1643 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1644 int i;\
1645 const int w=16;\
1646 for(i=0; i<w; i++)\
1647 {\
1648 const int src0= src[0*srcStride];\
1649 const int src1= src[1*srcStride];\
1650 const int src2= src[2*srcStride];\
1651 const int src3= src[3*srcStride];\
1652 const int src4= src[4*srcStride];\
1653 const int src5= src[5*srcStride];\
1654 const int src6= src[6*srcStride];\
1655 const int src7= src[7*srcStride];\
1656 const int src8= src[8*srcStride];\
1657 const int src9= src[9*srcStride];\
1658 const int src10= src[10*srcStride];\
1659 const int src11= src[11*srcStride];\
1660 const int src12= src[12*srcStride];\
1661 const int src13= src[13*srcStride];\
1662 const int src14= src[14*srcStride];\
1663 const int src15= src[15*srcStride];\
1664 const int src16= src[16*srcStride];\
1665 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1666 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1667 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1668 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1669 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1670 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1671 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1672 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1673 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1674 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1675 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1676 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1677 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1678 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1679 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1680 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1681 dst++;\
1682 src++;\
1683 }\
1684 }\
1685 \
1686 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1687 OPNAME ## pixels8_c(dst, src, stride, 8);\
1688 }\
1689 \
1690 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1691 uint8_t half[64];\
1692 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1693 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1694 }\
1695 \
1696 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1697 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1698 }\
1699 \
1700 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1701 uint8_t half[64];\
1702 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1703 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1704 }\
1705 \
1706 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1707 uint8_t full[16*9];\
1708 uint8_t half[64];\
1709 copy_block9(full, src, 16, stride, 9);\
1710 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1711 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1712 }\
1713 \
1714 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1715 uint8_t full[16*9];\
1716 copy_block9(full, src, 16, stride, 9);\
1717 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1718 }\
1719 \
1720 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1721 uint8_t full[16*9];\
1722 uint8_t half[64];\
1723 copy_block9(full, src, 16, stride, 9);\
1724 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1725 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1726 }\
1727 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1728 uint8_t full[16*9];\
1729 uint8_t halfH[72];\
1730 uint8_t halfV[64];\
1731 uint8_t halfHV[64];\
1732 copy_block9(full, src, 16, stride, 9);\
1733 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1734 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1735 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1736 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1737 }\
1738 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1739 uint8_t full[16*9];\
1740 uint8_t halfH[72];\
1741 uint8_t halfHV[64];\
1742 copy_block9(full, src, 16, stride, 9);\
1743 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1744 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1745 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1746 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1747 }\
1748 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1749 uint8_t full[16*9];\
1750 uint8_t halfH[72];\
1751 uint8_t halfV[64];\
1752 uint8_t halfHV[64];\
1753 copy_block9(full, src, 16, stride, 9);\
1754 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1755 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1756 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1757 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1758 }\
1759 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1760 uint8_t full[16*9];\
1761 uint8_t halfH[72];\
1762 uint8_t halfHV[64];\
1763 copy_block9(full, src, 16, stride, 9);\
1764 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1765 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1766 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1767 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1768 }\
1769 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1770 uint8_t full[16*9];\
1771 uint8_t halfH[72];\
1772 uint8_t halfV[64];\
1773 uint8_t halfHV[64];\
1774 copy_block9(full, src, 16, stride, 9);\
1775 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1776 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1777 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1778 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1779 }\
1780 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1781 uint8_t full[16*9];\
1782 uint8_t halfH[72];\
1783 uint8_t halfHV[64];\
1784 copy_block9(full, src, 16, stride, 9);\
1785 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1786 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1787 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1788 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1789 }\
1790 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1791 uint8_t full[16*9];\
1792 uint8_t halfH[72];\
1793 uint8_t halfV[64];\
1794 uint8_t halfHV[64];\
1795 copy_block9(full, src, 16, stride, 9);\
1796 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1797 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1798 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1799 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1800 }\
1801 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1802 uint8_t full[16*9];\
1803 uint8_t halfH[72];\
1804 uint8_t halfHV[64];\
1805 copy_block9(full, src, 16, stride, 9);\
1806 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1807 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1808 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1809 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1810 }\
1811 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1812 uint8_t halfH[72];\
1813 uint8_t halfHV[64];\
1814 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1815 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1816 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1817 }\
1818 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1819 uint8_t halfH[72];\
1820 uint8_t halfHV[64];\
1821 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1822 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1823 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1824 }\
1825 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1826 uint8_t full[16*9];\
1827 uint8_t halfH[72];\
1828 uint8_t halfV[64];\
1829 uint8_t halfHV[64];\
1830 copy_block9(full, src, 16, stride, 9);\
1831 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1832 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1833 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1834 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1835 }\
1836 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1837 uint8_t full[16*9];\
1838 uint8_t halfH[72];\
1839 copy_block9(full, src, 16, stride, 9);\
1840 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1841 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1842 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1843 }\
1844 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1845 uint8_t full[16*9];\
1846 uint8_t halfH[72];\
1847 uint8_t halfV[64];\
1848 uint8_t halfHV[64];\
1849 copy_block9(full, src, 16, stride, 9);\
1850 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1851 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1852 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1853 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1854 }\
1855 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1856 uint8_t full[16*9];\
1857 uint8_t halfH[72];\
1858 copy_block9(full, src, 16, stride, 9);\
1859 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1860 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1861 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1862 }\
1863 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1864 uint8_t halfH[72];\
1865 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1866 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1867 }\
1868 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1869 OPNAME ## pixels16_c(dst, src, stride, 16);\
1870 }\
1871 \
1872 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1873 uint8_t half[256];\
1874 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1875 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1876 }\
1877 \
1878 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1879 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1880 }\
1881 \
1882 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1883 uint8_t half[256];\
1884 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1885 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1886 }\
1887 \
1888 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1889 uint8_t full[24*17];\
1890 uint8_t half[256];\
1891 copy_block17(full, src, 24, stride, 17);\
1892 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1893 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1894 }\
1895 \
1896 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1897 uint8_t full[24*17];\
1898 copy_block17(full, src, 24, stride, 17);\
1899 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1900 }\
1901 \
1902 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1903 uint8_t full[24*17];\
1904 uint8_t half[256];\
1905 copy_block17(full, src, 24, stride, 17);\
1906 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1907 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1908 }\
1909 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1910 uint8_t full[24*17];\
1911 uint8_t halfH[272];\
1912 uint8_t halfV[256];\
1913 uint8_t halfHV[256];\
1914 copy_block17(full, src, 24, stride, 17);\
1915 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1916 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1917 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1918 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1919 }\
1920 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1921 uint8_t full[24*17];\
1922 uint8_t halfH[272];\
1923 uint8_t halfHV[256];\
1924 copy_block17(full, src, 24, stride, 17);\
1925 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1926 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1927 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1928 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1929 }\
1930 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1931 uint8_t full[24*17];\
1932 uint8_t halfH[272];\
1933 uint8_t halfV[256];\
1934 uint8_t halfHV[256];\
1935 copy_block17(full, src, 24, stride, 17);\
1936 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1937 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1938 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1939 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1940 }\
1941 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1942 uint8_t full[24*17];\
1943 uint8_t halfH[272];\
1944 uint8_t halfHV[256];\
1945 copy_block17(full, src, 24, stride, 17);\
1946 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1947 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1948 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1949 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1950 }\
1951 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1952 uint8_t full[24*17];\
1953 uint8_t halfH[272];\
1954 uint8_t halfV[256];\
1955 uint8_t halfHV[256];\
1956 copy_block17(full, src, 24, stride, 17);\
1957 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1958 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1959 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1960 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1961 }\
1962 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1963 uint8_t full[24*17];\
1964 uint8_t halfH[272];\
1965 uint8_t halfHV[256];\
1966 copy_block17(full, src, 24, stride, 17);\
1967 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1968 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1969 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1970 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1971 }\
1972 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1973 uint8_t full[24*17];\
1974 uint8_t halfH[272];\
1975 uint8_t halfV[256];\
1976 uint8_t halfHV[256];\
1977 copy_block17(full, src, 24, stride, 17);\
1978 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1979 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1980 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1981 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1982 }\
1983 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1984 uint8_t full[24*17];\
1985 uint8_t halfH[272];\
1986 uint8_t halfHV[256];\
1987 copy_block17(full, src, 24, stride, 17);\
1988 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1989 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1990 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1991 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1992 }\
1993 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1994 uint8_t halfH[272];\
1995 uint8_t halfHV[256];\
1996 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1997 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1998 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1999 }\
2000 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2001 uint8_t halfH[272];\
2002 uint8_t halfHV[256];\
2003 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2004 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2005 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2006 }\
2007 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2008 uint8_t full[24*17];\
2009 uint8_t halfH[272];\
2010 uint8_t halfV[256];\
2011 uint8_t halfHV[256];\
2012 copy_block17(full, src, 24, stride, 17);\
2013 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2014 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2015 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2016 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2017 }\
2018 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2019 uint8_t full[24*17];\
2020 uint8_t halfH[272];\
2021 copy_block17(full, src, 24, stride, 17);\
2022 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2023 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2024 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2025 }\
2026 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2027 uint8_t full[24*17];\
2028 uint8_t halfH[272];\
2029 uint8_t halfV[256];\
2030 uint8_t halfHV[256];\
2031 copy_block17(full, src, 24, stride, 17);\
2032 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2033 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2034 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2035 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2036 }\
2037 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2038 uint8_t full[24*17];\
2039 uint8_t halfH[272];\
2040 copy_block17(full, src, 24, stride, 17);\
2041 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2042 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2043 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2044 }\
2045 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2046 uint8_t halfH[272];\
2047 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2048 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2049 }
2050
/* Writeback ops for the MPEG-4 qpel MC functions generated by QPEL_MC.
 * 'b' is a raw fixed-point filter sum; "+16 >> 5" rounds to nearest after
 * the filter's /32 normalization, while "+15 >> 5" is the no-rounding
 * variant.  cm[] is the crop table clamping to 0..255; the avg ops
 * additionally round-average with the pixel already stored in 'a'. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the rounding put/avg and non-rounding put variants
 * (the non-rounding avg variant is intentionally left disabled). */
QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
2064
2065 #if 1
/**
 * Generator for the H.264 quarter-pel 6-tap (1,-5,20,20,-5,1) lowpass
 * interpolation primitives at sizes 2, 4, 8 and 16.
 *
 * OPNAME       prefix of the emitted functions (e.g. put_, avg_).
 * OP(dst, v)   stores a single-pass filtered value: 'v' is the raw 6-tap
 *              sum (gain 32), so OP must round (+16), shift right by 5
 *              and clamp through cm[].
 * OP2(dst, v)  stores a two-pass (hv) value: the intermediate rows kept
 *              in int16_t tmp[] are unshifted 6-tap sums, so OP2 must
 *              round (+512) and shift right by 10 instead.
 *
 * _h_lowpass filters horizontally, _v_lowpass vertically.  _hv_lowpass
 * first filters horizontally into the caller-provided tmp[] — including
 * the two extra rows above and three below that the vertical filter
 * needs — then filters tmp[] vertically with OP2.
 * The 16-wide versions are composed of four 8-wide calls.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    /* back up two rows: the vertical pass needs them */ \
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind to the output origin (row 2 of the h+5 filtered rows) */ \
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
/* the 16x16 kernels process four 8x8 quadrants */ \
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/**
 * Generator for the 16 H.264 luma motion-compensation functions of one
 * block size (SIZE x SIZE).  The _mcXY suffix encodes the quarter-pel
 * phase: X = horizontal offset in quarter pixels (0..3), Y = vertical.
 *
 * Building blocks (see H264_LOWPASS above):
 *  - full[]     copy of the source with 2 extra rows above and 3 below,
 *               so the vertical filter can run; full_mid points at the
 *               block origin inside it.
 *  - halfH/halfV/halfHV  half-pel planes from the h, v and hv filters.
 * Quarter-pel positions are formed by rounding-averaging (pixels*_l2)
 * two of: the source, a half-pel plane, or the hv plane — per the H.264
 * fractional sample interpolation rules.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    /* integer-pel position: plain copy/average */ \
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
/* diagonal quarter-pel positions: average an H-filtered and a V-filtered
   plane; the +1 column / +stride row selects which half-pel neighbours */ \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

/* Writeback ops for the H.264 kernels generated above.
 * op_put/op_avg normalize a single-pass 6-tap sum (round +16, >>5);
 * op2_put/op2_avg normalize the two-pass hv sum (round +512, >>10).
 * The avg variants round-average with the pixel already in dst. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]

/* Emit the lowpass kernels and the 16 MC functions per block size.
 * Note: only the put_ variant is instantiated for the 2x2 size. */
H264_LOWPASS(put_ , op_put, op2_put)
H264_LOWPASS(avg_ , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
2487 #endif
2488
/* H.264 weighted prediction, C reference.
 * op_scale1: unidirectional  block[x] = (block[x]*weight  + offset) >> log2_denom
 * op_scale2: bidirectional   dst[x]   = (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)
 * clip_uint8() clamps the shifted result to 0..255. */
#define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/* Generator for one W x H weighting pair.  W and H are compile-time
 * constants, so the 'if(W==N) continue;' guards are folded away and the
 * fully unrolled row body is pruned to the requested width. */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    /* pre-scale offset and fold in the 1<<(log2_denom-1) rounding term */ \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    /* make the combined offset odd so >> (log2_denom+1) rounds to nearest */ \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

/* instantiate every W x H partition size used by H.264 */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2558
2559 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2560 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2561 int i;
2562
2563 for(i=0; i<h; i++){
2564 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2565 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2566 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2567 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2568 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2569 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2570 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2571 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2572 dst+=dstStride;
2573 src+=srcStride;
2574 }
2575 }
2576
2577 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2578 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2579 int i;
2580
2581 for(i=0; i<w; i++){
2582 const int src_1= src[ -srcStride];
2583 const int src0 = src[0 ];
2584 const int src1 = src[ srcStride];
2585 const int src2 = src[2*srcStride];
2586 const int src3 = src[3*srcStride];
2587 const int src4 = src[4*srcStride];
2588 const int src5 = src[5*srcStride];
2589 const int src6 = src[6*srcStride];
2590 const int src7 = src[7*srcStride];
2591 const int src8 = src[8*srcStride];
2592 const int src9 = src[9*srcStride];
2593 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2594 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2595 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2596 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2597 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2598 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2599 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2600 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2601 src++;
2602 dst++;
2603 }
2604 }
2605
/* mspel MC, integer-pel position: plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2609
/* mspel MC, quarter-pel left of center: average of src and the
 * horizontally filtered half-pel plane. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2615
/* mspel MC, horizontal half-pel: filtered plane written directly. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2619
/* mspel MC, quarter-pel right of center: average of src+1 and the
 * horizontally filtered half-pel plane. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2625
/* mspel MC, vertical half-pel: filtered plane written directly. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2629
/* mspel MC, 2-D position (left column): halfH holds 11 horizontally
 * filtered rows starting at src-stride, so the vertical filter applied at
 * halfH+8 (the src row) has context above and below; the result is
 * averaged with the purely vertical half-pel plane. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel MC, 2-D position (right column): like mc12 but the vertical
 * half-pel plane is taken one pixel to the right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel MC, center 2-D half-pel: horizontal filter with one row of
 * margin on each side (11 rows), then vertical filter of the middle. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2653
/**
 * H.263 deblocking, vertical-edge filter: smooths across a horizontal
 * block boundary. For each of the 8 columns it adjusts the two pixels on
 * either side of the edge (p0 p1 | p2 p3), with strength derived from the
 * quantizer via ff_h263_loop_filter_strength[].
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;   /* edge-difference measure */

        /* piecewise-linear correction: full for small |d|, ramping down to
           zero once |d| reaches 2*strength (large d = real edge, keep it) */
        if (d<-2*strength) d1= 0;
        else if(d<- strength) d1=-2*strength - d;
        else if(d< strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branchless clamp to 0..255: bit 8 set means over-/underflow,
           ~(x>>31) yields 255 for positive overflow and 0 for negative */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= ABS(d1)>>1;

        /* secondary, smaller correction on the outer pixels */
        d2= clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+ stride] = p3 + d2;
    }
}
2688
/**
 * H.263 deblocking, horizontal-edge filter: smooths across a vertical
 * block boundary. Transposed twin of h263_v_loop_filter_c — identical
 * math, pixels addressed along each of the 8 rows (p0 p1 | p2 p3).
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;   /* edge-difference measure */

        /* piecewise-linear correction, see h263_v_loop_filter_c */
        if (d<-2*strength) d1= 0;
        else if(d<- strength) d1=-2*strength - d;
        else if(d< strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branchless clamp to 0..255 (bit 8 flags over-/underflow) */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= ABS(d1)>>1;

        /* secondary, smaller correction on the outer pixels */
        d2= clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
}
2723
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block.
 * The vertical pass writes 4x-scaled values into temp[] (edge rows are
 * only scaled, not filtered); the horizontal pass filters temp[] back
 * into src[] with rounding, again leaving the edge columns unfiltered. */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int col, row, src_idx, tmp_idx;
    int temp[64];

    /* vertical pass, results scaled by 4 */
    for(col = 0; col < 8; col++){
        temp[col      ] = 4*src[col];
        temp[col + 7*8] = 4*src[col + 7*stride];
    }
    for(row = 1; row < 7; row++){
        for(col = 0; col < 8; col++){
            src_idx = row*stride + col;
            tmp_idx = row*8 + col;
            temp[tmp_idx] = src[src_idx - stride] + 2*src[src_idx] + src[src_idx + stride];
        }
    }

    /* horizontal pass back into src[], with rounding */
    for(row = 0; row < 8; row++){
        src[    row*stride] = (temp[    row*8] + 2) >> 2;
        src[7 + row*stride] = (temp[7 + row*8] + 2) >> 2;
        for(col = 1; col < 7; col++){
            src_idx = row*stride + col;
            tmp_idx = row*8 + col;
            src[src_idx] = (temp[tmp_idx-1] + 2*temp[tmp_idx] + temp[tmp_idx+1] + 8) >> 4;
        }
    }
}
2750
/**
 * H.264 deblocking of one luma edge (normal mode). The edge is split into
 * 4 segments of 4 pixels; tc0[i] < 0 disables filtering for segment i.
 * xstride steps across the edge, ystride along it (callers pass 1/stride
 * or stride/1 for vertical vs. horizontal edges).
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;   /* skip this 4-pixel segment entirely */
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            /* filter only where the edge looks like a blocking artifact,
               not a real image edge */
            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* luma only: optionally adjust p1/q1 as well; each such
                   adjustment widens the clip range for the p0/q0 delta */
                if( ABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( ABS( q2 - q0 ) < beta ) {
                    pix[ xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
                pix[0] = clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* vertical luma edge: step across with stride, along with 1 */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/* horizontal luma edge: step across with 1, along with stride */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2799
/**
 * H.264 deblocking of one chroma edge (normal mode): 4 segments of
 * 2 pixels each; only p0/q0 are modified. tc0[i] <= 0 disables segment i.
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;   /* skip this 2-pixel segment */
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            /* filter only blocking artifacts, not real edges */
            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
                pix[0] = clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* vertical chroma edge: step across with stride, along with 1 */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/* horizontal chroma edge: step across with 1, along with stride */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
2836
/* H.264 chroma deblocking for intra (strong) edges: an unconditional
 * (2,1,1)/4 smoothing of p0/q0 wherever the edge passes the
 * alpha/beta artifact tests; 8 pixels along the edge, no tc clipping. */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++, pix += ystride ) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0        ];
        const int q1 = pix[ 1*xstride];

        /* leave real image edges untouched */
        if( ABS( p0 - q0 ) >= alpha ) continue;
        if( ABS( p1 - p0 ) >= beta  ) continue;
        if( ABS( q1 - q0 ) >= beta  ) continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/* vertical chroma intra edge: step across with stride, along with 1 */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/* horizontal chroma intra edge: step across with 1, along with stride */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
2864
/* Sum of absolute differences over a 16-pixel-wide block of height h.
 * The void* context argument is unused here (kept for the me_cmp_func
 * signature). */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2892
/* SAD of pix1 against a horizontally half-pel shifted reference: each
 * reference sample is the rounded average of pix2[x] and pix2[x+1]
 * (reads one pixel past the 16-wide block). */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2920
/* SAD of pix1 against a vertically half-pel shifted reference: each
 * reference sample is the rounded average of a pixel and the one directly
 * below it (pix3 tracks the next row of pix2). */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;
    uint8_t *pix3 = pix2 + line_size;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix3[col]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2950
/* SAD of pix1 against a diagonally (x+y) half-pel shifted reference:
 * each reference sample is the rounded average of the 2x2 neighbourhood
 * (reads one pixel past the block both right and down). */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;
    uint8_t *pix3 = pix2 + line_size;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col+1], pix3[col], pix3[col+1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2980
/* Sum of absolute differences over an 8-pixel-wide block of height h. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3000
/* 8-wide SAD against a horizontally half-pel shifted reference
 * (rounded average of each pixel and its right neighbour). */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3020
/* 8-wide SAD against a vertically half-pel shifted reference
 * (rounded average of each pixel and the one below it). */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;
    uint8_t *pix3 = pix2 + line_size;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix3[col]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
3042
/* 8-wide SAD against a diagonally half-pel shifted reference
 * (rounded average of each 2x2 neighbourhood). */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;
    uint8_t *pix3 = pix2 + line_size;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col+1], pix3[col], pix3[col+1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
3064
3065 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3066 MpegEncContext *c = v;
3067 int score1=0;
3068 int score2=0;
3069 int x,y;
3070
3071 for(y=0; y<h; y++){
3072 for(x=0; x<16; x++){
3073 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3074 }
3075 if(y+1<h){
3076 for(x=0; x<15; x++){
3077 score2+= ABS( s1[x ] - s1[x +stride]
3078 - s1[x+1] + s1[x+1+stride])
3079 -ABS( s2[x ] - s2[x +stride]
3080 - s2[x+1] + s2[x+1+stride]);
3081 }
3082 }
3083 s1+= stride;
3084 s2+= stride;
3085 }
3086
3087 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3088 else return score1 + ABS(score2)*8;
3089 }
3090
3091 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3092 MpegEncContext *c = v;
3093 int score1=0;
3094 int score2=0;
3095 int x,y;
3096
3097 for(y=0; y<h; y++){
3098 for(x=0; x<8; x++){
3099 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3100 }
3101 if(y+1<h){
3102 for(x=0; x<7; x++){
3103 score2+= ABS( s1[x ] - s1[x +stride]
3104 - s1[x+1] + s1[x+1+stride])
3105 -ABS( s2[x ] - s2[x +stride]
3106 - s2[x+1] + s2[x+1+stride]);
3107 }
3108 }
3109 s1+= stride;
3110 s2+= stride;
3111 }
3112
3113 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3114 else return score1 + ABS(score2)*8;
3115 }
3116
/**
 * Estimates the weighted squared error of adding `scale` times the given
 * basis function to the residual `rem`, without modifying rem.
 * NOTE(review): BASIS_SHIFT/RECON_SHIFT are fixed-point shift constants
 * defined outside this file — the rounding term below matches them.
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        /* reconstruct one sample of (rem + scale*basis) with rounding */
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;   /* weighted squared error, scaled */
    }
    return sum>>2;
}
3131
/**
 * Adds `scale` times the given basis function to the residual `rem`
 * in place, using the same fixed-point rounding as try_8x8basis_c.
 */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
3139
3140 /**
3141 * permutes an 8x8 block.
3142 * @param block the block which will be permuted according to the given permutation vector
3143 * @param permutation the permutation vector
3144 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3145 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3146 * (inverse) permutated to scantable order!
3147 */
3148 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3149 {
3150 int i;
3151 DCTELEM temp[64];
3152
3153 if(last<=0) return;
3154 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3155
3156 for(i=0; i<=last; i++){
3157 const int j= scantable[i];
3158 temp[j]= block[j];
3159 block[j]=0;
3160 }
3161
3162 for(i=0; i<=last; i++){
3163 const int j= scantable[i];
3164 const int perm_j= permutation[j];
3165 block[perm_j]= temp[j];
3166 }
3167 }
3168
/* Dummy comparison function used for FF_CMP_ZERO: always scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
3172
/**
 * Fills cmp[0..4] with the comparison functions of the requested type,
 * taken from the matching DSPContext table (index i selects the size
 * variant). Only the low byte of `type` selects the metric; unrecognized
 * values leave the slots zeroed (from the memset) and log an error.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
3230
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    /* zero all 6 blocks of 64 coefficients in one call */
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
3238
/* Byte-wise dst[i] += src[i] for i in [0, w); additions wrap modulo 256
 * because the operands are uint8_t. */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    for(i = 0; i < w; i++)
        dst[i] = dst[i] + src[i];
}
3254
/* Byte-wise dst[i] = src1[i] - src2[i] for i in [0, w); differences wrap
 * modulo 256 because dst is uint8_t. */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    for(i = 0; i < w; i++)
        dst[i] = src1[i] - src2[i];
}
3270
/**
 * HuffYUV median-prediction residual: dst[i] = src2[i] - pred, where pred
 * is mid_pred(left, top, left+top-topleft) (presumably the median of the
 * three — see mid_pred). src1 supplies the "top" row, src2 the current
 * row. left/left_top carry predictor state across calls and are updated
 * on return.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];    /* top-left for the next column comes from src1 */
        l= src2[i];     /* left for the next column comes from src2 */
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
3288
/* two-input butterfly: o1/o2 receive sum and difference of i1/i2 */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* in-place butterfly on x and y */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* final butterfly stage folded into the absolute sum: |x+y| + |x-y| */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3303
/**
 * Hadamard-transformed difference of one 8x8 block (SATD-style metric):
 * applies an 8x8 Hadamard transform to (src - dst) via butterfly stages
 * and returns the sum of the absolute transform coefficients.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal transform of each row of the difference */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical transform of each column; the last butterfly stage is
       folded into the absolute-value accumulation (BUTTERFLYA) */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3355
/**
 * Hadamard transform of one 8x8 block of src itself (intra variant):
 * like hadamard8_diff8x8_c but operating on the pixels directly, with the
 * DC contribution subtracted at the end. `dummy` is unused.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal transform of each row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical transform; last stage folded into the absolute sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    /* temp[0]+temp[32] at this point is the DC coefficient; remove its
       contribution so the metric measures AC energy only */
    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3403
/**
 * DCT-domain SAD: forward-transforms the 8x8 difference of src1 and src2
 * with the context's fdct and returns the sum of absolute coefficients.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;   /* 8-byte aligned scratch */
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum+= ABS(temp[i]);

    return sum;
}
3420
3421 #ifdef CONFIG_GPL
/* One-dimensional 8-point integer transform used by dct264_sad: reads
 * input through the caller-supplied SRC(i) macro and writes through
 * DST(i, value). Pure adds, subtracts and shifts (no multiplies). */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}
3448
3449 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3450 MpegEncContext * const s= (MpegEncContext *)c;