/* Source provenance: libav.git — libavcodec/dsputil.c
 * at commit 0fa95fffe873598925dddaca1e3b4b76c583a81e */
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21 */
22
23 /**
24 * @file dsputil.c
25 * DSP utils
26 */
27
28 #include "avcodec.h"
29 #include "dsputil.h"
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
32 #include "faandct.h"
33
/* Clipping lookup table with MAX_NEG_CROP guard entries on each side, and a
 * table of squares indexed by signed difference.  Both are zeroed here and
 * presumably filled by the DSP init code — not visible in this file chunk. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t squareTbl[512] = {0, };
36
/* Classic JPEG/MPEG zigzag scan order for an 8x8 block (row-major indices). */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
47
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
60
/* not permutated inverse zigzag_direct + 1 for MMX quantizer
 * (zeroed here; filled at init time — not visible in this chunk) */
uint16_t __align8 inv_zigzag_direct16[64] = {0, };
63
/* Alternate horizontal scan order (used e.g. by MPEG-style codecs). */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
74
/* Alternate vertical scan order (used e.g. for interlaced blocks). */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
85
/* Fixed-point reciprocal table: a*inverse[b]>>32 == a/b for all
 * 0<=a<=65536 && 2<=b<=255 (entry b is roughly 2^32/b, rounded up). */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
121
/* Input permutation for the simple_idct_mmx
 * (maps natural coefficient order to the MMX idct's expected layout). */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
133
/**
 * Sum all 256 pixel values of a 16x16 block.
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return the sum of all 256 pixels
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int row, col, sum = 0;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += pix[col];
        pix += line_size;
    }
    return sum;
}
155
156 static int pix_norm1_c(uint8_t * pix, int line_size)
157 {
158 int s, i, j;
159 uint32_t *sq = squareTbl + 256;
160
161 s = 0;
162 for (i = 0; i < 16; i++) {
163 for (j = 0; j < 16; j += 8) {
164 #if 0
165 s += sq[pix[0]];
166 s += sq[pix[1]];
167 s += sq[pix[2]];
168 s += sq[pix[3]];
169 s += sq[pix[4]];
170 s += sq[pix[5]];
171 s += sq[pix[6]];
172 s += sq[pix[7]];
173 #else
174 #if LONG_MAX > 2147483647
175 register uint64_t x=*(uint64_t*)pix;
176 s += sq[x&0xff];
177 s += sq[(x>>8)&0xff];
178 s += sq[(x>>16)&0xff];
179 s += sq[(x>>24)&0xff];
180 s += sq[(x>>32)&0xff];
181 s += sq[(x>>40)&0xff];
182 s += sq[(x>>48)&0xff];
183 s += sq[(x>>56)&0xff];
184 #else
185 register uint32_t x=*(uint32_t*)pix;
186 s += sq[x&0xff];
187 s += sq[(x>>8)&0xff];
188 s += sq[(x>>16)&0xff];
189 s += sq[(x>>24)&0xff];
190 x=*(uint32_t*)(pix+4);
191 s += sq[x&0xff];
192 s += sq[(x>>8)&0xff];
193 s += sq[(x>>16)&0xff];
194 s += sq[(x>>24)&0xff];
195 #endif
196 #endif
197 pix += 8;
198 }
199 pix += line_size - 16;
200 }
201 return s;
202 }
203
/**
 * Byte-swap w 32-bit words from src into dst (element-wise, so in-place
 * operation with dst == src is safe).
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i = 0;

    /* main loop: eight words per iteration */
    while (i + 8 <= w) {
        dst[i+0] = bswap_32(src[i+0]);
        dst[i+1] = bswap_32(src[i+1]);
        dst[i+2] = bswap_32(src[i+2]);
        dst[i+3] = bswap_32(src[i+3]);
        dst[i+4] = bswap_32(src[i+4]);
        dst[i+5] = bswap_32(src[i+5]);
        dst[i+6] = bswap_32(src[i+6]);
        dst[i+7] = bswap_32(src[i+7]);
        i += 8;
    }
    /* tail: remaining words one at a time */
    while (i < w) {
        dst[i] = bswap_32(src[i]);
        i++;
    }
}
221
222 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
223 {
224 int s, i;
225 uint32_t *sq = squareTbl + 256;
226
227 s = 0;
228 for (i = 0; i < h; i++) {
229 s += sq[pix1[0] - pix2[0]];
230 s += sq[pix1[1] - pix2[1]];
231 s += sq[pix1[2] - pix2[2]];
232 s += sq[pix1[3] - pix2[3]];
233 pix1 += line_size;
234 pix2 += line_size;
235 }
236 return s;
237 }
238
239 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
240 {
241 int s, i;
242 uint32_t *sq = squareTbl + 256;
243
244 s = 0;
245 for (i = 0; i < h; i++) {
246 s += sq[pix1[0] - pix2[0]];
247 s += sq[pix1[1] - pix2[1]];
248 s += sq[pix1[2] - pix2[2]];
249 s += sq[pix1[3] - pix2[3]];
250 s += sq[pix1[4] - pix2[4]];
251 s += sq[pix1[5] - pix2[5]];
252 s += sq[pix1[6] - pix2[6]];
253 s += sq[pix1[7] - pix2[7]];
254 pix1 += line_size;
255 pix2 += line_size;
256 }
257 return s;
258 }
259
260 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
261 {
262 int s, i;
263 uint32_t *sq = squareTbl + 256;
264
265 s = 0;
266 for (i = 0; i < h; i++) {
267 s += sq[pix1[ 0] - pix2[ 0]];
268 s += sq[pix1[ 1] - pix2[ 1]];
269 s += sq[pix1[ 2] - pix2[ 2]];
270 s += sq[pix1[ 3] - pix2[ 3]];
271 s += sq[pix1[ 4] - pix2[ 4]];
272 s += sq[pix1[ 5] - pix2[ 5]];
273 s += sq[pix1[ 6] - pix2[ 6]];
274 s += sq[pix1[ 7] - pix2[ 7]];
275 s += sq[pix1[ 8] - pix2[ 8]];
276 s += sq[pix1[ 9] - pix2[ 9]];
277 s += sq[pix1[10] - pix2[10]];
278 s += sq[pix1[11] - pix2[11]];
279 s += sq[pix1[12] - pix2[12]];
280 s += sq[pix1[13] - pix2[13]];
281 s += sq[pix1[14] - pix2[14]];
282 s += sq[pix1[15] - pix2[15]];
283
284 pix1 += line_size;
285 pix2 += line_size;
286 }
287 return s;
288 }
289
290
/**
 * Wavelet-based distortion measure: take the pixel difference of two
 * blocks, run a spatial DWT over it and sum the absolute values of the
 * transform coefficients.
 * @param w    block width; 8 selects 3 decomposition levels, anything
 *             else (16) selects 4
 * @param type wavelet type forwarded to ff_spatial_dwt (callers use
 *             1 for the 5/3 and 0 for the 9/7 variant, judging by the
 *             w53_*/w97_* wrappers below)
 * @return sum of absolute coefficients, divided by 4
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[16*16];
#if 0
    /* disabled: per-subband weighting tables (first index: wavelet type,
     * second: 8x8 vs 16x16, then level and orientation) */
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            //8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            //16x16 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {//FIXME 5/3
            //8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            //16x16 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };
#endif

    /* difference of the two blocks, scaled by 16 for extra precision in
     * the transform; tmp always uses a stride of 16 */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    ff_spatial_dwt(tmp, w, h, 16, type, dec_count);

    s=0;
#if 0
    /* disabled: weighted sum over the individual subbands */
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int sx= (ori&1) ? 1<<level: 0;
            int stride= 16<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;
            int size= 1<<level;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += ABS(v);
                }
            }
        }
    }
#endif
    /* plain unweighted sum of absolute transform coefficients */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            s+= ABS(tmp[16*i+j+0]);
            s+= ABS(tmp[16*i+j+1]);
            s+= ABS(tmp[16*i+j+2]);
            s+= ABS(tmp[16*i+j+3]);
        }
    }
    assert(s>=0);

    return s>>2;
}
370
/* 5/3 wavelet distortion of an 8-wide block */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}
374
/* 9/7 wavelet distortion of an 8-wide block */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}
378
/* 5/3 wavelet distortion of a 16-wide block */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
382
/* 9/7 wavelet distortion of a 16-wide block */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
386
387 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
388 {
389 int i;
390
391 /* read the pixels */
392 for(i=0;i<8;i++) {
393 block[0] = pixels[0];
394 block[1] = pixels[1];
395 block[2] = pixels[2];
396 block[3] = pixels[3];
397 block[4] = pixels[4];
398 block[5] = pixels[5];
399 block[6] = pixels[6];
400 block[7] = pixels[7];
401 pixels += line_size;
402 block += 8;
403 }
404 }
405
406 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
407 const uint8_t *s2, int stride){
408 int i;
409
410 /* read the pixels */
411 for(i=0;i<8;i++) {
412 block[0] = s1[0] - s2[0];
413 block[1] = s1[1] - s2[1];
414 block[2] = s1[2] - s2[2];
415 block[3] = s1[3] - s2[3];
416 block[4] = s1[4] - s2[4];
417 block[5] = s1[5] - s2[5];
418 block[6] = s1[6] - s2[6];
419 block[7] = s1[7] - s2[7];
420 s1 += stride;
421 s2 += stride;
422 block += 8;
423 }
424 }
425
426
427 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
428 int line_size)
429 {
430 int i;
431 uint8_t *cm = cropTbl + MAX_NEG_CROP;
432
433 /* read the pixels */
434 for(i=0;i<8;i++) {
435 pixels[0] = cm[block[0]];
436 pixels[1] = cm[block[1]];
437 pixels[2] = cm[block[2]];
438 pixels[3] = cm[block[3]];
439 pixels[4] = cm[block[4]];
440 pixels[5] = cm[block[5]];
441 pixels[6] = cm[block[6]];
442 pixels[7] = cm[block[7]];
443
444 pixels += line_size;
445 block += 8;
446 }
447 }
448
449 static void put_signed_pixels_clamped_c(const DCTELEM *block,
450 uint8_t *restrict pixels,
451 int line_size)
452 {
453 int i, j;
454
455 for (i = 0; i < 8; i++) {
456 for (j = 0; j < 8; j++) {
457 if (*block < -128)
458 *pixels = 0;
459 else if (*block > 127)
460 *pixels = 255;
461 else
462 *pixels = (uint8_t)(*block + 128);
463 block++;
464 pixels++;
465 }
466 pixels += (line_size - 8);
467 }
468 }
469
470 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
471 int line_size)
472 {
473 int i;
474 uint8_t *cm = cropTbl + MAX_NEG_CROP;
475
476 /* read the pixels */
477 for(i=0;i<8;i++) {
478 pixels[0] = cm[pixels[0] + block[0]];
479 pixels[1] = cm[pixels[1] + block[1]];
480 pixels[2] = cm[pixels[2] + block[2]];
481 pixels[3] = cm[pixels[3] + block[3]];
482 pixels[4] = cm[pixels[4] + block[4]];
483 pixels[5] = cm[pixels[5] + block[5]];
484 pixels[6] = cm[pixels[6] + block[6]];
485 pixels[7] = cm[pixels[7] + block[7]];
486 pixels += line_size;
487 block += 8;
488 }
489 }
#if 0
/* Disabled 64-bit variant of the PIXOP2 pel-copy/average macro: the same
 * operations as the active 32-bit variant below, but working on whole
 * uint64_t words via LD64.  Kept for reference; the #else branch is the
 * one actually compiled. */

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
633
/* Active 32-bit PIXOP2: generates the whole family of pel copy/average
 * primitives (widths 2/4/8/16; plain, x2, y2 and xy2 half-pel variants;
 * rounding and no-rounding flavours) for a given store operation OP
 * (op_put or op_avg).  The _l2/_l4 helpers average 2 resp. 4 sources;
 * the xy2 variants use the classic "split each byte into low 2 bits and
 * high 6 bits" trick so four pixels are averaged per 32-bit word without
 * cross-byte carries. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD16(&src1[i*src_stride1  ]);\
        b= LD16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                 int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i, a0, b0, a1, b1;\
    a0= pixels[0];\
    b0= pixels[1] + 2;\
    a0 += b0;\
    b0 += pixels[2];\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        a1= pixels[0];\
        b1= pixels[1];\
        a1 += b1;\
        b1 += pixels[2];\
\
        block[0]= (a1+a0)>>2; /* FIXME non put */\
        block[1]= (b1+b0)>>2;\
\
        pixels+=line_size;\
        block +=line_size;\
\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        block[0]= (a1+a0)>>2;\
        block[1]= (b1+b0)>>2;\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint32_t a= LD32(pixels  );\
    const uint32_t b= LD32(pixels+1);\
    uint32_t l0=  (a&0x03030303UL)\
                + (b&0x03030303UL)\
                + 0x02020202UL;\
    uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
               + ((b&0xFCFCFCFCUL)>>2);\
    uint32_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint32_t a= LD32(pixels  );\
        uint32_t b= LD32(pixels+1);\
        l1=  (a&0x03030303UL)\
           + (b&0x03030303UL);\
        h1= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD32(pixels  );\
        b= LD32(pixels+1);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

/* op_avg / op_put are the per-32-bit-word store operations expanded inside
   the PIXOP2 macro: avg blends with the destination through rnd_avg32,
   put overwrites it. */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif /* closes the implementation-variant conditional opened above */
#define op_put(a, b) a = b

/* Instantiate the full families of put/avg pixel copy+interpolation
   functions from the PIXOP2 template. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* Scalar rounding averages of 2 and 4 byte-range values. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1011
/* Single-stride convenience wrapper: forwards to the macro-generated
   three-stride put_no_rnd_pixels16_l2, using the same stride for dst and
   both sources. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1015
/* Single-stride convenience wrapper: forwards to the macro-generated
   three-stride put_no_rnd_pixels8_l2, using the same stride for dst and
   both sources. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1019
/**
 * Bilinear interpolation of an 8-pixel-wide, h-tall block with 1/16-pel
 * fractional offsets (x16, y16), used for the single-warp-point GMC case.
 * The four corner weights sum to 256, hence the >>8 after adding rounder.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int y;

    for (y = 0; y < h; y++) {
        int x;
        /* weighted sum of the 2x2 neighbourhood around each output pixel */
        for (x = 0; x < 8; x++)
            dst[x] = (A * src[x]          + B * src[x + 1]
                    + C * src[stride + x] + D * src[stride + x + 1]
                    + rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
1042
/**
 * Global motion compensation with a per-pixel affine warp (general GMC
 * case). For every destination pixel of an 8-wide, h-tall block, a source
 * position is derived from a 16.16 fixed-point vector and the sample is
 * bilinearly interpolated with 'shift' bits of sub-pel precision.
 *
 * (ox,oy):   start vector for the first pixel of the first row (16.16)
 * (dxx,dyx): vector increment per output column
 * (dxy,dyy): vector increment per output row
 * r:         rounding constant added before the final >>(2*shift)
 * width/height: source clip limits (in pels)
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;  /* number of sub-pel positions per full pel */

    /* convert to the last valid coordinate, so the unsigned-compare range
       checks below also guarantee that index+1 / index+stride stay in bounds */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* vx>>16 is the position in sub-pel (1/s pel) units: its low
               'shift' bits are the bilinear fraction, the rest the pel. */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* (unsigned) compare folds the negative case into one test:
               src_x < width (width is already the last index) also keeps
               src_x+1 in range for the interpolation taps */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: full 2x2 bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp y, interpolate horizontally only
                       (the *s factor replaces the dropped vertical blend) */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp x, interpolate vertically only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* outside in both directions: nearest clamped sample */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1100
/* Full-pel thirdpel case: plain block copy dispatched on the block width.
   Unsupported widths are silently ignored (no default action). */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1109
/* Thirdpel interpolation, horizontal offset 1/3:
   out ~= (2*cur + right)/3, computed as (683*(2a+b+1))>>11 with
   683 = round(2^11 / 3). */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (683*(2*src[col] + src[col+1] + 1)) >> 11;
    }
}
1120
/* Thirdpel interpolation, horizontal offset 2/3:
   out ~= (cur + 2*right)/3, computed as (683*(a+2b+1))>>11 with
   683 = round(2^11 / 3). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
    }
}
1131
/* Thirdpel interpolation, vertical offset 1/3:
   out ~= (2*cur + below)/3, computed as (683*(2a+c+1))>>11 with
   683 = round(2^11 / 3). */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
    }
}
1142
/* Thirdpel interpolation at offset (1/3, 1/3): 2x2 weighted blend
   ~(4a + 3b + 3c + 2d)/12, computed with 2731 = round(2^15 / 12).
   a=cur, b=right, c=below, d=below-right. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
    }
}
1153
/* Thirdpel interpolation at offset (1/3, 2/3): 2x2 weighted blend
   ~(3a + 2b + 4c + 3d)/12, computed with 2731 = round(2^15 / 12).
   a=cur, b=right, c=below, d=below-right. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
    }
}
1164
/* Thirdpel interpolation, vertical offset 2/3:
   out ~= (cur + 2*below)/3, computed as (683*(a+2c+1))>>11 with
   683 = round(2^11 / 3). */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
    }
}
1175
/* Thirdpel interpolation at offset (2/3, 1/3): 2x2 weighted blend
   ~(3a + 4b + 2c + 3d)/12, computed with 2731 = round(2^15 / 12).
   a=cur, b=right, c=below, d=below-right. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
    }
}
1186
/* Thirdpel interpolation at offset (2/3, 2/3): 2x2 weighted blend
   ~(2a + 3b + 3c + 4d)/12, computed with 2731 = round(2^15 / 12).
   a=cur, b=right, c=below, d=below-right. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++)
            dst[col] = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
    }
}
1197
/* Full-pel thirdpel averaging case: rounded average with the existing dst,
   dispatched on the block width. Unsupported widths are silently ignored. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1206
/* Thirdpel (1/3 horizontal) interpolation averaged with the existing dst:
   dst = (dst + interp + 1) >> 1, interp = (683*(2a+b+1))>>11. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1217
/* Thirdpel (2/3 horizontal) interpolation averaged with the existing dst:
   dst = (dst + interp + 1) >> 1, interp = (683*(a+2b+1))>>11. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1228
/* Thirdpel (1/3 vertical) interpolation averaged with the existing dst:
   dst = (dst + interp + 1) >> 1, interp = (683*(2a+c+1))>>11. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            int interp = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1239
/* Thirdpel (1/3, 1/3) interpolation averaged with the existing dst:
   dst = (dst + interp + 1) >> 1, interp = (2731*(4a+3b+3c+2d+6))>>15. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            int interp = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1250
/* Thirdpel (1/3, 2/3) interpolation averaged with the existing dst:
   dst = (dst + interp + 1) >> 1, interp = (2731*(3a+2b+4c+3d+6))>>15. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            int interp = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1261
/* Thirdpel (2/3 vertical) interpolation averaged with the existing dst:
   dst = (dst + interp + 1) >> 1, interp = (683*(a+2c+1))>>11. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            int interp = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1272
/* Thirdpel (2/3, 1/3) interpolation averaged with the existing dst:
   dst = (dst + interp + 1) >> 1, interp = (2731*(3a+4b+2c+3d+6))>>15. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            int interp = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1283
/* Thirdpel (2/3, 2/3) interpolation averaged with the existing dst:
   dst = (dst + interp + 1) >> 1, interp = (2731*(2a+3b+3c+4d+6))>>15. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            int interp = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1294 #if 0
1295 #define TPEL_WIDTH(width)\
1296 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1297 void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1298 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1299 void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1300 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1301 void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1302 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1303 void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1304 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1305 void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1306 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1307 void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1308 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1309 void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1310 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1311 void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1312 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1313 void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1314 #endif
1315
/**
 * H264_CHROMA_MC(OPNAME, OP) expands to three chroma motion-compensation
 * routines, 2, 4 and 8 pixels wide, each doing 1/8-pel bilinear
 * interpolation over h rows. A..D are the four 2x2 corner weights derived
 * from the fractional offsets x, y (the weights sum to 64); the OP macro
 * supplied at the instantiation site performs the store and, in this file,
 * the final (v+32)>>6 normalization. x and y must lie in [0,8).
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1378
/* Store operations plugged into H264_CHROMA_MC: both normalize the 6-bit
   fixed-point sum with (v+32)>>6; avg additionally takes a rounded average
   with the value already in dst. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

/* Instantiate put_/avg_ h264_chroma_mc{2,4,8}_c. */
H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1386
/* Copy h rows of 4 bytes from src to dst using the unaligned 32-bit
   load/store helpers; the strides may differ. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1397
/* Copy h rows of 8 bytes from src to dst as two unaligned 32-bit
   load/store pairs per row; the strides may differ. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
1409
/* Copy h rows of 16 bytes from src to dst in 32-bit chunks via the
   unaligned load/store helpers; the strides may differ. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int k;
    while (h-- > 0) {
        for (k = 0; k < 16; k += 4)
            ST32(dst + k, LD32(src + k));
        dst += dstStride;
        src += srcStride;
    }
}
1423
/* Copy h rows of 17 bytes from src to dst: 16 bytes in 32-bit chunks plus
   one trailing byte per row (needed by the qpel edge taps); strides may
   differ. */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int k;
    while (h-- > 0) {
        for (k = 0; k < 16; k += 4)
            ST32(dst + k, LD32(src + k));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1438
/* Copy h rows of 9 bytes from src to dst: two 32-bit chunks plus one
   trailing byte per row (needed by the qpel edge taps); strides may
   differ. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1451
1452
1453 #define QPEL_MC(r, OPNAME, RND, OP) \
1454 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1455 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1456 int i;\
1457 for(i=0; i<h; i++)\
1458 {\
1459 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1460 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1461 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1462 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1463 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1464 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1465 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1466 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1467 dst+=dstStride;\
1468 src+=srcStride;\
1469 }\
1470 }\
1471 \
1472 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1473 const int w=8;\
1474 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1475 int i;\
1476 for(i=0; i<w; i++)\
1477 {\
1478 const int src0= src[0*srcStride];\
1479 const int src1= src[1*srcStride];\
1480 const int src2= src[2*srcStride];\
1481 const int src3= src[3*srcStride];\
1482 const int src4= src[4*srcStride];\
1483 const int src5= src[5*srcStride];\
1484 const int src6= src[6*srcStride];\
1485 const int src7= src[7*srcStride];\
1486 const int src8= src[8*srcStride];\
1487 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1488 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1489 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1490 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1491 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1492 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1493 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1494 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1495 dst++;\
1496 src++;\
1497 }\
1498 }\
1499 \
1500 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1501 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1502 int i;\
1503 \
1504 for(i=0; i<h; i++)\
1505 {\
1506 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1507 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1508 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1509 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1510 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1511 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1512 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1513 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1514 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1515 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1516 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1517 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1518 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1519 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1520 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1521 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1522 dst+=dstStride;\
1523 src+=srcStride;\
1524 }\
1525 }\
1526 \
1527 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1528 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1529 int i;\
1530 const int w=16;\
1531 for(i=0; i<w; i++)\
1532 {\
1533 const int src0= src[0*srcStride];\
1534 const int src1= src[1*srcStride];\
1535 const int src2= src[2*srcStride];\
1536 const int src3= src[3*srcStride];\
1537 const int src4= src[4*srcStride];\
1538 const int src5= src[5*srcStride];\
1539 const int src6= src[6*srcStride];\
1540 const int src7= src[7*srcStride];\
1541 const int src8= src[8*srcStride];\
1542 const int src9= src[9*srcStride];\
1543 const int src10= src[10*srcStride];\
1544 const int src11= src[11*srcStride];\
1545 const int src12= src[12*srcStride];\
1546 const int src13= src[13*srcStride];\
1547 const int src14= src[14*srcStride];\
1548 const int src15= src[15*srcStride];\
1549 const int src16= src[16*srcStride];\
1550 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1551 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1552 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1553 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1554 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1555 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1556 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1557 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1558 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1559 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1560 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1561 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1562 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1563 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1564 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1565 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1566 dst++;\
1567 src++;\
1568 }\
1569 }\
1570 \
1571 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1572 OPNAME ## pixels8_c(dst, src, stride, 8);\
1573 }\
1574 \
1575 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1576 uint8_t half[64];\
1577 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1578 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1579 }\
1580 \
1581 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1582 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1583 }\
1584 \
1585 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1586 uint8_t half[64];\
1587 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1588 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1589 }\
1590 \
1591 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1592 uint8_t full[16*9];\
1593 uint8_t half[64];\
1594 copy_block9(full, src, 16, stride, 9);\
1595 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1596 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1597 }\
1598 \
1599 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1600 uint8_t full[16*9];\
1601 copy_block9(full, src, 16, stride, 9);\
1602 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1603 }\
1604 \
1605 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1606 uint8_t full[16*9];\
1607 uint8_t half[64];\
1608 copy_block9(full, src, 16, stride, 9);\
1609 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1610 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1611 }\
1612 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1613 uint8_t full[16*9];\
1614 uint8_t halfH[72];\
1615 uint8_t halfV[64];\
1616 uint8_t halfHV[64];\
1617 copy_block9(full, src, 16, stride, 9);\
1618 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1619 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1620 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1621 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1622 }\
1623 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1624 uint8_t full[16*9];\
1625 uint8_t halfH[72];\
1626 uint8_t halfHV[64];\
1627 copy_block9(full, src, 16, stride, 9);\
1628 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1629 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1630 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1631 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1632 }\
1633 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1634 uint8_t full[16*9];\
1635 uint8_t halfH[72];\
1636 uint8_t halfV[64];\
1637 uint8_t halfHV[64];\
1638 copy_block9(full, src, 16, stride, 9);\
1639 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1640 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1641 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1642 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1643 }\
1644 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1645 uint8_t full[16*9];\
1646 uint8_t halfH[72];\
1647 uint8_t halfHV[64];\
1648 copy_block9(full, src, 16, stride, 9);\
1649 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1650 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1651 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1652 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1653 }\
1654 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1655 uint8_t full[16*9];\
1656 uint8_t halfH[72];\
1657 uint8_t halfV[64];\
1658 uint8_t halfHV[64];\
1659 copy_block9(full, src, 16, stride, 9);\
1660 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1661 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1662 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1663 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1664 }\
1665 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1666 uint8_t full[16*9];\
1667 uint8_t halfH[72];\
1668 uint8_t halfHV[64];\
1669 copy_block9(full, src, 16, stride, 9);\
1670 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1671 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1672 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1673 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1674 }\
1675 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1676 uint8_t full[16*9];\
1677 uint8_t halfH[72];\
1678 uint8_t halfV[64];\
1679 uint8_t halfHV[64];\
1680 copy_block9(full, src, 16, stride, 9);\
1681 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1682 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1683 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1684 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1685 }\
1686 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1687 uint8_t full[16*9];\
1688 uint8_t halfH[72];\
1689 uint8_t halfHV[64];\
1690 copy_block9(full, src, 16, stride, 9);\
1691 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1692 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1693 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1694 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1695 }\
1696 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1697 uint8_t halfH[72];\
1698 uint8_t halfHV[64];\
1699 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1700 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1701 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1702 }\
1703 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1704 uint8_t halfH[72];\
1705 uint8_t halfHV[64];\
1706 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1707 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1708 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1709 }\
1710 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1711 uint8_t full[16*9];\
1712 uint8_t halfH[72];\
1713 uint8_t halfV[64];\
1714 uint8_t halfHV[64];\
1715 copy_block9(full, src, 16, stride, 9);\
1716 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1717 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1718 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1719 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1720 }\
1721 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1722 uint8_t full[16*9];\
1723 uint8_t halfH[72];\
1724 copy_block9(full, src, 16, stride, 9);\
1725 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1726 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1727 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1728 }\
1729 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1730 uint8_t full[16*9];\
1731 uint8_t halfH[72];\
1732 uint8_t halfV[64];\
1733 uint8_t halfHV[64];\
1734 copy_block9(full, src, 16, stride, 9);\
1735 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1736 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1737 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1738 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1739 }\
1740 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1741 uint8_t full[16*9];\
1742 uint8_t halfH[72];\
1743 copy_block9(full, src, 16, stride, 9);\
1744 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1745 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1746 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1747 }\
1748 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1749 uint8_t halfH[72];\
1750 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1751 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1752 }\
1753 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1754 OPNAME ## pixels16_c(dst, src, stride, 16);\
1755 }\
1756 \
1757 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1758 uint8_t half[256];\
1759 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1760 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1761 }\
1762 \
1763 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1764 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1765 }\
1766 \
1767 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1768 uint8_t half[256];\
1769 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1770 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1771 }\
1772 \
1773 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1774 uint8_t full[24*17];\
1775 uint8_t half[256];\
1776 copy_block17(full, src, 24, stride, 17);\
1777 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1778 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1779 }\
1780 \
1781 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1782 uint8_t full[24*17];\
1783 copy_block17(full, src, 24, stride, 17);\
1784 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1785 }\
1786 \
1787 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1788 uint8_t full[24*17];\
1789 uint8_t half[256];\
1790 copy_block17(full, src, 24, stride, 17);\
1791 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1792 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1793 }\
1794 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1795 uint8_t full[24*17];\
1796 uint8_t halfH[272];\
1797 uint8_t halfV[256];\
1798 uint8_t halfHV[256];\
1799 copy_block17(full, src, 24, stride, 17);\
1800 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1801 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1802 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1803 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1804 }\
1805 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1806 uint8_t full[24*17];\
1807 uint8_t halfH[272];\
1808 uint8_t halfHV[256];\
1809 copy_block17(full, src, 24, stride, 17);\
1810 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1811 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1812 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1813 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1814 }\
1815 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1816 uint8_t full[24*17];\
1817 uint8_t halfH[272];\
1818 uint8_t halfV[256];\
1819 uint8_t halfHV[256];\
1820 copy_block17(full, src, 24, stride, 17);\
1821 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1822 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1823 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1824 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1825 }\
1826 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1827 uint8_t full[24*17];\
1828 uint8_t halfH[272];\
1829 uint8_t halfHV[256];\
1830 copy_block17(full, src, 24, stride, 17);\
1831 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1832 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1833 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1834 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1835 }\
1836 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1837 uint8_t full[24*17];\
1838 uint8_t halfH[272];\
1839 uint8_t halfV[256];\
1840 uint8_t halfHV[256];\
1841 copy_block17(full, src, 24, stride, 17);\
1842 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1843 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1844 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1845 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1846 }\
1847 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1848 uint8_t full[24*17];\
1849 uint8_t halfH[272];\
1850 uint8_t halfHV[256];\
1851 copy_block17(full, src, 24, stride, 17);\
1852 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1853 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1854 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1855 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1856 }\
1857 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1858 uint8_t full[24*17];\
1859 uint8_t halfH[272];\
1860 uint8_t halfV[256];\
1861 uint8_t halfHV[256];\
1862 copy_block17(full, src, 24, stride, 17);\
1863 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1864 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1865 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1866 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1867 }\
1868 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1869 uint8_t full[24*17];\
1870 uint8_t halfH[272];\
1871 uint8_t halfHV[256];\
1872 copy_block17(full, src, 24, stride, 17);\
1873 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1874 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1875 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1876 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1877 }\
1878 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1879 uint8_t halfH[272];\
1880 uint8_t halfHV[256];\
1881 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1882 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1883 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1884 }\
1885 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1886 uint8_t halfH[272];\
1887 uint8_t halfHV[256];\
1888 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1889 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1890 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1891 }\
1892 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1893 uint8_t full[24*17];\
1894 uint8_t halfH[272];\
1895 uint8_t halfV[256];\
1896 uint8_t halfHV[256];\
1897 copy_block17(full, src, 24, stride, 17);\
1898 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1899 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1900 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1901 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1902 }\
1903 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1904 uint8_t full[24*17];\
1905 uint8_t halfH[272];\
1906 copy_block17(full, src, 24, stride, 17);\
1907 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1908 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1909 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1910 }\
1911 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1912 uint8_t full[24*17];\
1913 uint8_t halfH[272];\
1914 uint8_t halfV[256];\
1915 uint8_t halfHV[256];\
1916 copy_block17(full, src, 24, stride, 17);\
1917 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1918 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1919 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1920 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1921 }\
1922 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1923 uint8_t full[24*17];\
1924 uint8_t halfH[272];\
1925 copy_block17(full, src, 24, stride, 17);\
1926 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1927 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1928 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1929 }\
1930 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1931 uint8_t halfH[272];\
1932 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1933 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1934 }
1935
/* Pixel-store operations plugged into QPEL_MC as its OP parameter.
 * 'b' is the raw 6-tap filter sum (scaled by 32); cm[] (cropTbl biased by
 * MAX_NEG_CROP) clips the down-shifted value to 0..255.
 *   op_put        : store with rounding (+16 before >>5)
 *   op_put_no_rnd : store without rounding (+15), for no_rnd MC variants
 *   op_avg        : rounded average of the filtered value with the value
 *                   already in the destination (for bidirectional MC)
 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the MPEG-4 quarter-pel MC function families:
 * put, put_no_rnd and avg flavours of qpel8/qpel16 mcXY. */
QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1949
#if 1 /* NOTE(review): looks like a leftover debugging toggle — confirm it can be removed */
/* H264_LOWPASS(OPNAME, OP, OP2) expands to the H.264 half-sample luma
 * interpolation primitives for 4x4, 8x8 and 16x16 blocks, using the
 * standard 6-tap filter (1,-5,20,20,-5,1):
 *   *_h_lowpass  : horizontal filtering, result stored via OP (clip >>5)
 *   *_v_lowpass  : vertical filtering, result stored via OP
 *   *_hv_lowpass : horizontal pass into the int16_t 'tmp' buffer (kept at
 *                  full precision), then vertical pass on tmp, stored via
 *                  OP2 (clip >>10)
 * The 16x16 versions are built from four 8x8 calls.
 * OP/OP2 are store macros (op_put/op_avg/op2_put/op2_avg below). */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/* H264_MC(OPNAME, SIZE) expands to the 16 quarter-pel motion compensation
 * entry points h264_qpelSIZE_mcXY for one block size, where X/Y are the
 * horizontal/vertical quarter-sample offsets (0..3):
 *   mc00        : plain pixel copy/average
 *   mcX0 / mc0Y : 1-D half-pel filter, blended with the nearer full-pel
 *                 sample for quarter positions
 *   mc22        : 2-D half-pel via the hv_lowpass (int16_t 'tmp' keeps
 *                 full precision between passes)
 *   other mcXY  : average of two half-pel intermediates as per the
 *                 H.264 quarter-sample rules
 * 'full' holds a copy of the source with 2 extra rows above and 3 below
 * (SIZE+5 rows) so the 6-tap vertical filter never reads out of bounds;
 * full_mid points at the first "real" row. */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

/* Store macros for the H.264 interpolation templates above.
 * op_put/op_avg clip a one-pass filter sum (>>5 with rounding);
 * op2_put/op2_avg clip a two-pass (hv) sum, which carries an extra
 * factor of 32, hence >>10 with +512 rounding. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]

/* Instantiate the put and avg families for 4x4, 8x8 and 16x16 blocks. */
H264_LOWPASS(put_ , op_put, op2_put)
H264_LOWPASS(avg_ , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2311
2312 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2313 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2314 int i;
2315
2316 for(i=0; i<h; i++){
2317 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2318 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2319 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2320 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2321 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2322 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2323 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2324 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2325 dst+=dstStride;
2326 src+=srcStride;
2327 }
2328 }
2329
2330 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2331 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2332 int i;
2333
2334 for(i=0; i<w; i++){
2335 const int src_1= src[ -srcStride];
2336 const int src0 = src[0 ];
2337 const int src1 = src[ srcStride];
2338 const int src2 = src[2*srcStride];
2339 const int src3 = src[3*srcStride];
2340 const int src4 = src[4*srcStride];
2341 const int src5 = src[5*srcStride];
2342 const int src6 = src[6*srcStride];
2343 const int src7 = src[7*srcStride];
2344 const int src8 = src[8*srcStride];
2345 const int src9 = src[9*srcStride];
2346 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2347 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2348 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2349 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2350 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2351 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2352 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2353 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2354 src++;
2355 dst++;
2356 }
2357 }
2358
/* MSPEL full-pel position (0,0): plain 8x8 copy, no filtering. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2362
/* MSPEL quarter-pel (1,0): average of the full-pel block and the
 * horizontal half-pel interpolation. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[64];
    wmv2_mspel8_h_lowpass(halfH, src, 8, stride, 8);
    put_pixels8_l2(dst, src, halfH, stride, stride, 8, 8);
}
2368
/* MSPEL half-pel (2,0): horizontal half-pel filter straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2372
/* MSPEL quarter-pel (3,0): average of the right full-pel neighbour and
 * the horizontal half-pel interpolation. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[64];
    wmv2_mspel8_h_lowpass(halfH, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, halfH, stride, stride, 8, 8);
}
2378
/* MSPEL half-pel (0,2): vertical half-pel filter straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2382
/* MSPEL position (1,2): average of the vertical half-pel block and the
 * 2-D (horizontal-then-vertical) half-pel block. The horizontal pass
 * covers 11 rows starting one above the block so the vertical filter
 * has its -1..9 support; halfH+8 skips that extra top row. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];  /* 11 rows x 8 of horizontally filtered pixels */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* MSPEL position (3,2): like mc12 but the pure-vertical term is taken
 * one pixel to the right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];  /* 11 rows x 8 of horizontally filtered pixels */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* MSPEL half-pel (2,2): horizontal pass over 11 rows (one above the
 * block for vertical-filter support), then the vertical pass writes
 * directly into dst; halfH+8 skips the extra top row. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];  /* 11 rows x 8 of horizontally filtered pixels */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2406
2407 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2408 int x;
2409 const int strength= ff_h263_loop_filter_strength[qscale];
2410
2411 for(x=0; x<8; x++){
2412 int d1, d2, ad1;
2413 int p0= src[x-2*stride];
2414 int p1= src[x-1*stride];
2415 int p2= src[x+0*stride];
2416 int p3= src[x+1*stride];
2417 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2418
2419 if (d<-2*strength) d1= 0;
2420 else if(d<- strength) d1=-2*strength - d;
2421 else if(d< strength) d1= d;
2422 else if(d< 2*strength) d1= 2*strength - d;
2423 else d1= 0;
2424
2425 p1 += d1;
2426 p2 -= d1;
2427 if(p1&256) p1= ~(p1>>31);
2428 if(p2&256) p2= ~(p2>>31);
2429
2430 src[x-1*stride] = p1;
2431 src[x+0*stride] = p2;
2432
2433 ad1= ABS(d1)>>1;
2434
2435 d2= clip((p0-p3)/4, -ad1, ad1);
2436
2437 src[x-2*stride] = p0 - d2;
2438 src[x+ stride] = p3 + d2;
2439 }
2440 }
2441
2442 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2443 int y;
2444 const int strength= ff_h263_loop_filter_strength[qscale];
2445
2446 for(y=0; y<8; y++){
2447 int d1, d2, ad1;
2448 int p0= src[y*stride-2];
2449 int p1= src[y*stride-1];
2450 int p2= src[y*stride+0];
2451 int p3= src[y*stride+1];
2452 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2453
2454 if (d<-2*strength) d1= 0;
2455 else if(d<- strength) d1=-2*strength - d;
2456 else if(d< strength) d1= d;
2457 else if(d< 2*strength) d1= 2*strength - d;
2458 else d1= 0;
2459
2460 p1 += d1;
2461 p2 -= d1;
2462 if(p1&256) p1= ~(p1>>31);
2463 if(p2&256) p2= ~(p2>>31);
2464
2465 src[y*stride-1] = p1;
2466 src[y*stride+0] = p2;
2467
2468 ad1= ABS(d1)>>1;
2469
2470 d2= clip((p0-p3)/4, -ad1, ad1);
2471
2472 src[y*stride-2] = p0 - d2;
2473 src[y*stride+1] = p3 + d2;
2474 }
2475 }
2476
/**
 * H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block,
 * applied in place. Border rows/columns are passed through unfiltered.
 * The vertical pass writes into temp[] at 4x scale; the horizontal pass
 * folds in the remaining (1,2,1) tap and the combined /16 rounding.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int temp[64];
    int x, y;

    /* vertical (1,2,1) pass into temp, scaled by 4; top and bottom rows
       are copied (x4) so they come out unchanged after >>2 */
    for(x=0; x<8; x++){
        temp[x]      = 4*src[x];
        temp[x + 56] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            const int p= y*stride + x;
            temp[y*8 + x] = src[p - stride] + 2*src[p] + src[p + stride];
        }
    }

    /* horizontal (1,2,1) pass back into src with rounding; first and
       last columns only undo the x4 scale */
    for(y=0; y<8; y++){
        src[y*stride    ] = (temp[y*8    ] + 2)>>2;
        src[y*stride + 7] = (temp[y*8 + 7] + 2)>>2;
        for(x=1; x<7; x++){
            src[y*stride + x] = (temp[y*8 + x - 1] + 2*temp[y*8 + x] + temp[y*8 + x + 1] + 8)>>4;
        }
    }
}
2503
/**
 * Sum of absolute differences (SAD) over a 16-pixel-wide block, h rows.
 * The first parameter is an unused context pointer kept for the common
 * comparison-function signature.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<16; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2531
/**
 * SAD of pix1 against pix2 interpolated at horizontal half-pel
 * position (average of each pixel and its right neighbour).
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2559
/**
 * SAD of pix1 against pix2 interpolated at vertical half-pel
 * position (average of each pixel and the one below it).
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + line_size]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2589
/**
 * SAD of pix1 against pix2 interpolated at the diagonal half-pel
 * position (average of the 2x2 neighbourhood, rounded).
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg4(pix2[col],             pix2[col + 1],
                                        pix2[col + line_size], pix2[col + line_size + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2619
/**
 * Sum of absolute differences over an 8-wide block of height h.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2639
/**
 * 8-wide SAD against pix2 at horizontal half-pel position.
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2659
/**
 * 8-wide SAD against pix2 at vertical half-pel position.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + line_size]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2681
/**
 * 8-wide SAD against pix2 at the diagonal half-pel position.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col],             pix2[col + 1],
                                        pix2[col + line_size], pix2[col + line_size + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2703
2704 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2705 int score1=0;
2706 int score2=0;
2707 int x,y;
2708
2709 for(y=0; y<h; y++){
2710 for(x=0; x<16; x++){
2711 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2712 }
2713 if(y+1<h){
2714 for(x=0; x<15; x++){
2715 score2+= ABS( s1[x ] - s1[x +stride]
2716 - s1[x+1] + s1[x+1+stride])
2717 -ABS( s2[x ] - s2[x +stride]
2718 - s2[x+1] + s2[x+1+stride]);
2719 }
2720 }
2721 s1+= stride;
2722 s2+= stride;
2723 }
2724
2725 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2726 else return score1 + ABS(score2)*8;
2727 }
2728
2729 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2730 int score1=0;
2731 int score2=0;
2732 int x,y;
2733
2734 for(y=0; y<h; y++){
2735 for(x=0; x<8; x++){
2736 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2737 }
2738 if(y+1<h){
2739 for(x=0; x<7; x++){
2740 score2+= ABS( s1[x ] - s1[x +stride]
2741 - s1[x+1] + s1[x+1+stride])
2742 -ABS( s2[x ] - s2[x +stride]
2743 - s2[x+1] + s2[x+1+stride]);
2744 }
2745 }
2746 s1+= stride;
2747 s2+= stride;
2748 }
2749
2750 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2751 else return score1 + ABS(score2)*8;
2752 }
2753
2754 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2755 int i;
2756 unsigned int sum=0;
2757
2758 for(i=0; i<8*8; i++){
2759 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2760 int w= weight[i];
2761 b>>= RECON_SHIFT;
2762 assert(-512<b && b<512);
2763
2764 sum += (w*b)*(w*b)>>4;
2765 }
2766 return sum>>2;
2767 }
2768
2769 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2770 int i;
2771
2772 for(i=0; i<8*8; i++){
2773 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2774 }
2775 }
2776
2777 /**
2778 * permutes an 8x8 block.
2779 * @param block the block which will be permuted according to the given permutation vector
2780 * @param permutation the permutation vector
2781 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2782 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2783 * (inverse) permutated to scantable order!
2784 */
2785 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2786 {
2787 int i;
2788 DCTELEM temp[64];
2789
2790 if(last<=0) return;
2791 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2792
2793 for(i=0; i<=last; i++){
2794 const int j= scantable[i];
2795 temp[j]= block[j];
2796 block[j]=0;
2797 }
2798
2799 for(i=0; i<=last; i++){
2800 const int j= scantable[i];
2801 const int perm_j= permutation[j];
2802 block[perm_j]= temp[j];
2803 }
2804 }
2805
/* dummy comparison function for FF_CMP_ZERO: always reports a score of 0 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
2809
/**
 * Fill cmp[0..4] (one entry per block size) with the comparison
 * functions selected by type; only the low byte of type picks the
 * metric (FF_CMP_*), higher bits are ignored here.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
        default:
            /* unknown metric: entries stay NULL from the memset above */
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
2861
2862 /**
2863 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2864 */
2865 static void clear_blocks_c(DCTELEM *blocks)
2866 {
2867 memset(blocks, 0, sizeof(DCTELEM)*6*64);
2868 }
2869
/**
 * Element-wise dst[i] += src[i] over w bytes (additions wrap modulo 256
 * in the uint8_t store); unrolling is left to the compiler.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;

    for (i = 0; i < w; i++)
        dst[i] = dst[i] + src[i];
}
2885
/**
 * Element-wise dst[i] = src1[i] - src2[i] over w bytes (differences wrap
 * modulo 256 in the uint8_t store); unrolling is left to the compiler.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;

    for (i = 0; i < w; i++)
        dst[i] = src1[i] - src2[i];
}
2901
/**
 * HuffYUV-style median-prediction subtraction.
 * For each column the prediction is mid_pred(left, top, left+top-topleft)
 * where src1 holds the row above and src2 the current row; the residual
 * src2[i]-pred is written to dst.  *left and *left_top carry the running
 * left / top-left values across calls and are updated on return.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];    /* this column's top becomes next column's top-left */
        l= src2[i];     /* current pixel becomes next column's left */
        dst[i]= l - pred;   /* residual, wraps modulo 256 in the uint8_t store */
    }

    *left= l;
    *left_top= lt;
}
2919
2920 #define BUTTERFLY2(o1,o2,i1,i2) \
2921 o1= (i1)+(i2);\
2922 o2= (i1)-(i2);
2923
2924 #define BUTTERFLY1(x,y) \
2925 {\
2926 int a,b;\
2927 a= x;\
2928 b= y;\
2929 x= a+b;\
2930 y= a-b;\
2931 }
2932
2933 #define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2934
/**
 * SATD: 8x8 Hadamard transform of the difference src-dst, summing the
 * absolute transform coefficients.  Row transforms are done in the first
 * loop, column transforms (with the final butterfly folded into the
 * absolute-value sum) in the second.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        /* stage 1 of the row transform operates directly on the pixel differences */
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        /* last butterfly stage merged with |.| summation */
        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
2986
/**
 * Intra SATD: 8x8 Hadamard transform of src itself (no reference block),
 * summing absolute coefficients and then subtracting the DC term so the
 * block mean does not dominate the score.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        /* last butterfly stage merged with |.| summation */
        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3034
/**
 * Frequency-domain SAD: forward DCT of the 8x8 difference src1-src2,
 * then sum of the absolute transform coefficients.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* uint64_t backing ensures 8-byte alignment for the DCT */
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum+= ABS(temp[i]);

    return sum;
}
3051
3052 void simple_idct(DCTELEM *block); //FIXME
3053
/**
 * Quantization distortion measure: DCT the 8x8 difference, quantize and
 * dequantize it at the current qscale, inverse-transform, and return the
 * squared error against the unquantized coefficients.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* one aligned buffer holds both the working block and the backup copy */
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;  /* measured as an inter block */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME

    /* note: bak holds DCT-domain coefficients while temp has been
       round-tripped through the IDCT, matching the original code */
    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
3077
/**
 * Rate-distortion score for coding the 8x8 block src1 against the
 * prediction src2: DCT+quantize the difference, count the VLC bits the
 * coefficients would cost, reconstruct, and return
 * SSE(reconstruction, src1) + lambda*bits with lambda derived from qscale.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    uint64_t __align8 aligned_bak[stride];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distoration, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* save the prediction; the reconstruction is added onto this copy */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;    /* DC coded separately for intra blocks */
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* count run/level VLC bits; the final coefficient uses the
           "last" tables */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;  /* bias so the table index is non-negative */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    /* reconstruct into the saved prediction copy */
    s->dsp.idct_add(bak, stride, temp);

    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    /* lambda ~ qscale^2*109/128 ties the bit cost to the distortion scale */
    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
3156
/**
 * Rate-only comparison: number of VLC bits needed to code the quantized
 * DCT of the 8x8 difference block src1-src2.
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;    /* DC coded separately for intra blocks */
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* count run/level VLC bits; the final coefficient uses the
           "last" tables */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;  /* bias so the table index is non-negative */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
3216
/**
 * Vertical activity measure: sum of absolute differences between
 * vertically adjacent pixels inside a 16-wide block.
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int sum = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += ABS(s[col] - s[col + stride]);
        s += stride;
    }

    return sum;
}
3231
/**
 * Vertical SAD of the difference signal: measures how much the
 * per-pixel error s1-s2 changes between adjacent rows.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sum = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += ABS(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);
        s1 += stride;
        s2 += stride;
    }

    return sum;
}
3246
#define SQ(a) ((a)*(a))

/**
 * Vertical activity measure (squared): sum of squared differences
 * between vertically adjacent pixels inside a 16-wide block.
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int sum = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += SQ(s[col] - s[col + stride]);
        s += stride;
    }

    return sum;
}
3262
/**
 * Vertical SSE of the difference signal: squared row-to-row change
 * of the per-pixel error s1-s2.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sum = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += SQ(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);
        s1 += stride;
        s2 += stride;
    }

    return sum;
}
3277
3278 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3279 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3280 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3281 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3282 WARPER8_16_SQ(rd8x8_c, rd16_c)
3283 WARPER8_16_SQ(bit8x8_c, bit16_c)
3284
3285 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3286 converted */
/* jref (accurate integer) IDCT followed by a clamped store to dest */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* jref (accurate integer) IDCT followed by a clamped add onto dest */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
3297
3298 /* init static data */
3299 void dsputil_static_init(void)
3300 {
3301 int i;
3302
3303 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3304 for(i=0;i<MAX_NEG_CROP;i++) {
3305 cropTbl[i] = 0;
3306 cropTbl[i + MAX_NEG_CROP + 256] = 255;
3307 }
3308
3309 for(i=0;i<512;i++) {
3310 squareTbl[i] = (i - 256) * (i - 256);
3311 }
3312
3313 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3314 }
3315
3316
3317 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3318 {
3319 int i;
3320
3321 #ifdef CONFIG_ENCODERS
3322 if(avctx->dct_algo==FF_DCT_FASTINT) {
3323 c->fdct = fdct_ifast;
3324 c->fdct248 = fdct_ifast248;
3325 }
3326 else if(avctx->dct_algo==FF_DCT_FAAN) {
3327 c->fdct = ff_faandct;
3328 c->fdct248 = ff_faandct248;
3329 }
3330 else {
3331 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3332 c->fdct248 = ff_fdct248_islow;
3333 }
3334 #endif //CONFIG_ENCODERS
3335
3336 if(avctx->idct_algo==FF_IDCT_INT){
3337 c->idct_put= ff_jref_idct_put;
3338 c->idct_add= ff_jref_idct_add;
3339 c->idct = j_rev_dct;
3340 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3341 }else{ //accurate/default
3342 c->idct_put= simple_idct_put;
3343 c->idct_add= simple_idct_add;
3344 c->idct = simple_idct;
3345 c->idct_permutation_type= FF_NO_IDCT_PERM;
3346 }
3347
3348 /* VP3 DSP support */
3349 c->vp3_dsp_init = vp3_dsp_init_c;
3350 c->vp3_idct = vp3_idct_c;
3351
3352 c->get_pixels = get_pixels_c;
3353 c->diff_pixels = diff_pixels_c;
3354 c->put_pixels_clamped = put_pixels_clamped_c;
3355 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3356 c->add_pixels_clamped = add_pixels_clamped_c;
3357 c->gmc1 = gmc1_c;
3358 c->gmc = gmc_c;
3359 c->clear_blocks = clear_blocks_c;
3360 c->pix_sum = pix_sum_c;
3361 c->pix_norm1 = pix_norm1_c;
3362
3363 /* TODO [0] 16 [1] 8 */
3364 c->pix_abs[0][0] = pix_abs16_c;
3365 c->pix_abs[0][1] = pix_abs16_x2_c;
3366 c->pix_abs[0][2] = pix_abs16_y2_c;
3367 c->pix_abs[0][3] = pix_abs16_xy2_c;
3368 c->pix_abs[1][0] = pix_abs8_c;
3369 c->pix_abs[1][1] = pix_abs8_x2_c;
3370 c->pix_abs[1][2] = pix_abs8_y2_c;
3371 c->pix_abs[1][3] = pix_abs8_xy2_c;
3372
3373 #define dspfunc(PFX, IDX, NUM) \
3374 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3375 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3376 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3377 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3378
3379 dspfunc(put, 0, 16);
3380 dspfunc(put_no_rnd, 0, 16);
3381 dspfunc(put, 1, 8);
3382 dspfunc(put_no_rnd, 1, 8);
3383 dspfunc(put, 2, 4);
3384 dspfunc(put, 3, 2);
3385
3386 dspfunc(avg, 0, 16);
3387 dspfunc(avg_no_rnd, 0, 16);
3388 dspfunc(avg, 1, 8);
3389 dspfunc(avg_no_rnd, 1, 8);
3390 dspfunc(avg, 2, 4);
3391 dspfunc(avg, 3, 2);
3392 #undef dspfunc
3393
3394 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3395 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3396
3397 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3398 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3399 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3400 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3401 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3402 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3403 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3404 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3405 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3406
3407 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3408 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3409 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3410 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3411 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3412 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3413 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3414 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3415 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3416
3417 #define dspfunc(PFX, IDX, NUM) \
3418 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3419 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3420 c->PFX ## _pixels_tab