/* MMX for H.264 deblocking filter */
/* [libav.git] / libavcodec / dsputil.c */
/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 */

/**
 * @file dsputil.c
 * DSP utils
 */

28 #include "avcodec.h"
29 #include "dsputil.h"
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
32 #include "faandct.h"
33
34 /* snow.c */
35 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
36
37 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
38 uint32_t squareTbl[512] = {0, };
39
/* Classic zig-zag scan order for 8x8 blocks: entry i is the raster
 * (row-major) position of the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
50
/* Specific zigzag scan for 2-4-8 idct. NOTE that unlike the
   specification, we interleave the fields (i.e. even and odd rows
   alternate in the scan below instead of being scanned separately). */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
63
64 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
65 uint16_t __align8 inv_zigzag_direct16[64] = {0, };
66
/* Alternate (horizontally biased) scan order for 8x8 blocks; entry i
 * is the raster position of the i-th coefficient in scan order. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
77
/* Alternate (vertically biased) scan order for 8x8 blocks; entry i
 * is the raster position of the i-th coefficient in scan order. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
88
/* Fixed-point reciprocal table used to replace division by a small
 * constant with a multiply and shift:
 *   a*inverse[b]>>32 == a/b  for all 0<=a<=65536 && 2<=b<=255
 * inverse[b] is roughly 2^32/b; entry 0 is a placeholder (division by
 * zero is never performed) and entry 1 saturates at 2^32-1. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
124
/* Input permutation for the simple_idct_mmx: maps each raster
 * coefficient index to the position the MMX IDCT expects it at
 * (values are indices 0x00..0x3F into the 64-entry block). */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
136
static int pix_sum_c(uint8_t * pix, int line_size)
{
    /* Sum of all 256 pixel values of a 16x16 block; line_size is the
     * distance in bytes between the starts of consecutive rows. */
    int sum = 0;
    int x, y;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += pix[x];
        pix += line_size;
    }
    return sum;
}
158
159 static int pix_norm1_c(uint8_t * pix, int line_size)
160 {
161 int s, i, j;
162 uint32_t *sq = squareTbl + 256;
163
164 s = 0;
165 for (i = 0; i < 16; i++) {
166 for (j = 0; j < 16; j += 8) {
167 #if 0
168 s += sq[pix[0]];
169 s += sq[pix[1]];
170 s += sq[pix[2]];
171 s += sq[pix[3]];
172 s += sq[pix[4]];
173 s += sq[pix[5]];
174 s += sq[pix[6]];
175 s += sq[pix[7]];
176 #else
177 #if LONG_MAX > 2147483647
178 register uint64_t x=*(uint64_t*)pix;
179 s += sq[x&0xff];
180 s += sq[(x>>8)&0xff];
181 s += sq[(x>>16)&0xff];
182 s += sq[(x>>24)&0xff];
183 s += sq[(x>>32)&0xff];
184 s += sq[(x>>40)&0xff];
185 s += sq[(x>>48)&0xff];
186 s += sq[(x>>56)&0xff];
187 #else
188 register uint32_t x=*(uint32_t*)pix;
189 s += sq[x&0xff];
190 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff];
193 x=*(uint32_t*)(pix+4);
194 s += sq[x&0xff];
195 s += sq[(x>>8)&0xff];
196 s += sq[(x>>16)&0xff];
197 s += sq[(x>>24)&0xff];
198 #endif
199 #endif
200 pix += 8;
201 }
202 pix += line_size - 16;
203 }
204 return s;
205 }
206
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    /* Byte-swap w 32-bit words from src into dst.  Each output word
     * depends only on the same input word, so dst may equal src.
     * Words are handled eight per iteration, with a scalar loop for
     * the remainder. */
    int n = 0;

    while (n + 8 <= w) {
        int k;
        for (k = 0; k < 8; k++)
            dst[n + k] = bswap_32(src[n + k]);
        n += 8;
    }
    while (n < w) {
        dst[n] = bswap_32(src[n]);
        n++;
    }
}
224
225 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
226 {
227 int s, i;
228 uint32_t *sq = squareTbl + 256;
229
230 s = 0;
231 for (i = 0; i < h; i++) {
232 s += sq[pix1[0] - pix2[0]];
233 s += sq[pix1[1] - pix2[1]];
234 s += sq[pix1[2] - pix2[2]];
235 s += sq[pix1[3] - pix2[3]];
236 pix1 += line_size;
237 pix2 += line_size;
238 }
239 return s;
240 }
241
242 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
243 {
244 int s, i;
245 uint32_t *sq = squareTbl + 256;
246
247 s = 0;
248 for (i = 0; i < h; i++) {
249 s += sq[pix1[0] - pix2[0]];
250 s += sq[pix1[1] - pix2[1]];
251 s += sq[pix1[2] - pix2[2]];
252 s += sq[pix1[3] - pix2[3]];
253 s += sq[pix1[4] - pix2[4]];
254 s += sq[pix1[5] - pix2[5]];
255 s += sq[pix1[6] - pix2[6]];
256 s += sq[pix1[7] - pix2[7]];
257 pix1 += line_size;
258 pix2 += line_size;
259 }
260 return s;
261 }
262
263 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
264 {
265 int s, i;
266 uint32_t *sq = squareTbl + 256;
267
268 s = 0;
269 for (i = 0; i < h; i++) {
270 s += sq[pix1[ 0] - pix2[ 0]];
271 s += sq[pix1[ 1] - pix2[ 1]];
272 s += sq[pix1[ 2] - pix2[ 2]];
273 s += sq[pix1[ 3] - pix2[ 3]];
274 s += sq[pix1[ 4] - pix2[ 4]];
275 s += sq[pix1[ 5] - pix2[ 5]];
276 s += sq[pix1[ 6] - pix2[ 6]];
277 s += sq[pix1[ 7] - pix2[ 7]];
278 s += sq[pix1[ 8] - pix2[ 8]];
279 s += sq[pix1[ 9] - pix2[ 9]];
280 s += sq[pix1[10] - pix2[10]];
281 s += sq[pix1[11] - pix2[11]];
282 s += sq[pix1[12] - pix2[12]];
283 s += sq[pix1[13] - pix2[13]];
284 s += sq[pix1[14] - pix2[14]];
285 s += sq[pix1[15] - pix2[15]];
286
287 pix1 += line_size;
288 pix2 += line_size;
289 }
290 return s;
291 }
292
293
/**
 * Wavelet-domain distortion metric over a w x h (8x8 or 16x16) block.
 * The pixel differences are scaled up by 16, transformed with the
 * spatial DWT from snow.c ('type' selects the wavelet filter: the
 * w53_* wrappers pass 1, the w97_* wrappers pass 0 — presumably the
 * 5/3 and 9/7 filters; confirm against ff_spatial_dwt), and the sum
 * of absolute transformed coefficients, divided by 4, is returned.
 * The commented-out code is an unfinished per-subband weighting.
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4; /* decomposition levels: 3 for 8x8, 4 for 16x16 */
    int tmp[16*16];                    /* DWT work buffer, stride 16 */
#if 0
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            //8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            //16x16 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {//FIXME 5/3
            //8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            //16x16 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };
#endif

    /* Load the scaled differences into the work buffer. */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    /* Forward wavelet transform in place. */
    ff_spatial_dwt(tmp, w, h, 16, type, dec_count);

    s=0;
#if 0
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int sx= (ori&1) ? 1<<level: 0;
            int stride= 16<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;
            int size= 1<<level;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += ABS(v);
                }
            }
        }
    }
#endif
    /* Unweighted sum of absolute transformed coefficients. */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            s+= ABS(tmp[16*i+j+0]);
            s+= ABS(tmp[16*i+j+1]);
            s+= ABS(tmp[16*i+j+2]);
            s+= ABS(tmp[16*i+j+3]);
        }
    }
    assert(s>=0);

    return s>>2; /* undo part of the <<4 input scaling */
}
373
/* Wavelet metric, 8-wide block, filter type 1 ("53"); see w_c(). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}
377
/* Wavelet metric, 8-wide block, filter type 0 ("97"); see w_c(). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}
381
/* Wavelet metric, 16-wide block, filter type 1 ("53"); see w_c(). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
385
/* Wavelet metric, 16-wide block, filter type 0 ("97"); see w_c(). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
389
390 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
391 {
392 int i;
393
394 /* read the pixels */
395 for(i=0;i<8;i++) {
396 block[0] = pixels[0];
397 block[1] = pixels[1];
398 block[2] = pixels[2];
399 block[3] = pixels[3];
400 block[4] = pixels[4];
401 block[5] = pixels[5];
402 block[6] = pixels[6];
403 block[7] = pixels[7];
404 pixels += line_size;
405 block += 8;
406 }
407 }
408
409 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
410 const uint8_t *s2, int stride){
411 int i;
412
413 /* read the pixels */
414 for(i=0;i<8;i++) {
415 block[0] = s1[0] - s2[0];
416 block[1] = s1[1] - s2[1];
417 block[2] = s1[2] - s2[2];
418 block[3] = s1[3] - s2[3];
419 block[4] = s1[4] - s2[4];
420 block[5] = s1[5] - s2[5];
421 block[6] = s1[6] - s2[6];
422 block[7] = s1[7] - s2[7];
423 s1 += stride;
424 s2 += stride;
425 block += 8;
426 }
427 }
428
429
430 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
431 int line_size)
432 {
433 int i;
434 uint8_t *cm = cropTbl + MAX_NEG_CROP;
435
436 /* read the pixels */
437 for(i=0;i<8;i++) {
438 pixels[0] = cm[block[0]];
439 pixels[1] = cm[block[1]];
440 pixels[2] = cm[block[2]];
441 pixels[3] = cm[block[3]];
442 pixels[4] = cm[block[4]];
443 pixels[5] = cm[block[5]];
444 pixels[6] = cm[block[6]];
445 pixels[7] = cm[block[7]];
446
447 pixels += line_size;
448 block += 8;
449 }
450 }
451
452 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
453 int line_size)
454 {
455 int i;
456 uint8_t *cm = cropTbl + MAX_NEG_CROP;
457
458 /* read the pixels */
459 for(i=0;i<4;i++) {
460 pixels[0] = cm[block[0]];
461 pixels[1] = cm[block[1]];
462 pixels[2] = cm[block[2]];
463 pixels[3] = cm[block[3]];
464
465 pixels += line_size;
466 block += 8;
467 }
468 }
469
470 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
471 int line_size)
472 {
473 int i;
474 uint8_t *cm = cropTbl + MAX_NEG_CROP;
475
476 /* read the pixels */
477 for(i=0;i<2;i++) {
478 pixels[0] = cm[block[0]];
479 pixels[1] = cm[block[1]];
480
481 pixels += line_size;
482 block += 8;
483 }
484 }
485
486 static void put_signed_pixels_clamped_c(const DCTELEM *block,
487 uint8_t *restrict pixels,
488 int line_size)
489 {
490 int i, j;
491
492 for (i = 0; i < 8; i++) {
493 for (j = 0; j < 8; j++) {
494 if (*block < -128)
495 *pixels = 0;
496 else if (*block > 127)
497 *pixels = 255;
498 else
499 *pixels = (uint8_t)(*block + 128);
500 block++;
501 pixels++;
502 }
503 pixels += (line_size - 8);
504 }
505 }
506
507 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
508 int line_size)
509 {
510 int i;
511 uint8_t *cm = cropTbl + MAX_NEG_CROP;
512
513 /* read the pixels */
514 for(i=0;i<8;i++) {
515 pixels[0] = cm[pixels[0] + block[0]];
516 pixels[1] = cm[pixels[1] + block[1]];
517 pixels[2] = cm[pixels[2] + block[2]];
518 pixels[3] = cm[pixels[3] + block[3]];
519 pixels[4] = cm[pixels[4] + block[4]];
520 pixels[5] = cm[pixels[5] + block[5]];
521 pixels[6] = cm[pixels[6] + block[6]];
522 pixels[7] = cm[pixels[7] + block[7]];
523 pixels += line_size;
524 block += 8;
525 }
526 }
527
528 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
529 int line_size)
530 {
531 int i;
532 uint8_t *cm = cropTbl + MAX_NEG_CROP;
533
534 /* read the pixels */
535 for(i=0;i<4;i++) {
536 pixels[0] = cm[pixels[0] + block[0]];
537 pixels[1] = cm[pixels[1] + block[1]];
538 pixels[2] = cm[pixels[2] + block[2]];
539 pixels[3] = cm[pixels[3] + block[3]];
540 pixels += line_size;
541 block += 8;
542 }
543 }
544
545 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
546 int line_size)
547 {
548 int i;
549 uint8_t *cm = cropTbl + MAX_NEG_CROP;
550
551 /* read the pixels */
552 for(i=0;i<2;i++) {
553 pixels[0] = cm[pixels[0] + block[0]];
554 pixels[1] = cm[pixels[1] + block[1]];
555 pixels += line_size;
556 block += 8;
557 }
558 }
559 #if 0
560
561 #define PIXOP2(OPNAME, OP) \
562 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
563 {\
564 int i;\
565 for(i=0; i<h; i++){\
566 OP(*((uint64_t*)block), LD64(pixels));\
567 pixels+=line_size;\
568 block +=line_size;\
569 }\
570 }\
571 \
572 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
573 {\
574 int i;\
575 for(i=0; i<h; i++){\
576 const uint64_t a= LD64(pixels );\
577 const uint64_t b= LD64(pixels+1);\
578 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
579 pixels+=line_size;\
580 block +=line_size;\
581 }\
582 }\
583 \
584 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
585 {\
586 int i;\
587 for(i=0; i<h; i++){\
588 const uint64_t a= LD64(pixels );\
589 const uint64_t b= LD64(pixels+1);\
590 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
591 pixels+=line_size;\
592 block +=line_size;\
593 }\
594 }\
595 \
596 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
597 {\
598 int i;\
599 for(i=0; i<h; i++){\
600 const uint64_t a= LD64(pixels );\
601 const uint64_t b= LD64(pixels+line_size);\
602 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
603 pixels+=line_size;\
604 block +=line_size;\
605 }\
606 }\
607 \
608 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
609 {\
610 int i;\
611 for(i=0; i<h; i++){\
612 const uint64_t a= LD64(pixels );\
613 const uint64_t b= LD64(pixels+line_size);\
614 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
615 pixels+=line_size;\
616 block +=line_size;\
617 }\
618 }\
619 \
620 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
621 {\
622 int i;\
623 const uint64_t a= LD64(pixels );\
624 const uint64_t b= LD64(pixels+1);\
625 uint64_t l0= (a&0x0303030303030303ULL)\
626 + (b&0x0303030303030303ULL)\
627 + 0x0202020202020202ULL;\
628 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
629 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
630 uint64_t l1,h1;\
631 \
632 pixels+=line_size;\
633 for(i=0; i<h; i+=2){\
634 uint64_t a= LD64(pixels );\
635 uint64_t b= LD64(pixels+1);\
636 l1= (a&0x0303030303030303ULL)\
637 + (b&0x0303030303030303ULL);\
638 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
639 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
640 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
641 pixels+=line_size;\
642 block +=line_size;\
643 a= LD64(pixels );\
644 b= LD64(pixels+1);\
645 l0= (a&0x0303030303030303ULL)\
646 + (b&0x0303030303030303ULL)\
647 + 0x0202020202020202ULL;\
648 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
649 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
650 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
651 pixels+=line_size;\
652 block +=line_size;\
653 }\
654 }\
655 \
656 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
657 {\
658 int i;\
659 const uint64_t a= LD64(pixels );\
660 const uint64_t b= LD64(pixels+1);\
661 uint64_t l0= (a&0x0303030303030303ULL)\
662 + (b&0x0303030303030303ULL)\
663 + 0x0101010101010101ULL;\
664 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
665 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
666 uint64_t l1,h1;\
667 \
668 pixels+=line_size;\
669 for(i=0; i<h; i+=2){\
670 uint64_t a= LD64(pixels );\
671 uint64_t b= LD64(pixels+1);\
672 l1= (a&0x0303030303030303ULL)\
673 + (b&0x0303030303030303ULL);\
674 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
675 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
676 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
677 pixels+=line_size;\
678 block +=line_size;\
679 a= LD64(pixels );\
680 b= LD64(pixels+1);\
681 l0= (a&0x0303030303030303ULL)\
682 + (b&0x0303030303030303ULL)\
683 + 0x0101010101010101ULL;\
684 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
685 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
686 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
687 pixels+=line_size;\
688 block +=line_size;\
689 }\
690 }\
691 \
692 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
693 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
694 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
695 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
696 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
697 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
698 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
699
700 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
701 #else // 64 bit variant
702
703 #define PIXOP2(OPNAME, OP) \
704 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
705 int i;\
706 for(i=0; i<h; i++){\
707 OP(*((uint16_t*)(block )), LD16(pixels ));\
708 pixels+=line_size;\
709 block +=line_size;\
710 }\
711 }\
712 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
713 int i;\
714 for(i=0; i<h; i++){\
715 OP(*((uint32_t*)(block )), LD32(pixels ));\
716 pixels+=line_size;\
717 block +=line_size;\
718 }\
719 }\
720 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
721 int i;\
722 for(i=0; i<h; i++){\
723 OP(*((uint32_t*)(block )), LD32(pixels ));\
724 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
725 pixels+=line_size;\
726 block +=line_size;\
727 }\
728 }\
729 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
730 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
731 }\
732 \
733 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
734 int src_stride1, int src_stride2, int h){\
735 int i;\
736 for(i=0; i<h; i++){\
737 uint32_t a,b;\
738 a= LD32(&src1[i*src_stride1 ]);\
739 b= LD32(&src2[i*src_stride2 ]);\
740 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
741 a= LD32(&src1[i*src_stride1+4]);\
742 b= LD32(&src2[i*src_stride2+4]);\
743 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
744 }\
745 }\
746 \
747 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
748 int src_stride1, int src_stride2, int h){\
749 int i;\
750 for(i=0; i<h; i++){\
751 uint32_t a,b;\
752 a= LD32(&src1[i*src_stride1 ]);\
753 b= LD32(&src2[i*src_stride2 ]);\
754 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
755 a= LD32(&src1[i*src_stride1+4]);\
756 b= LD32(&src2[i*src_stride2+4]);\
757 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
758 }\
759 }\
760 \
761 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
762 int src_stride1, int src_stride2, int h){\
763 int i;\
764 for(i=0; i<h; i++){\
765 uint32_t a,b;\
766 a= LD32(&src1[i*src_stride1 ]);\
767 b= LD32(&src2[i*src_stride2 ]);\
768 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
769 }\
770 }\
771 \
772 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
773 int src_stride1, int src_stride2, int h){\
774 int i;\
775 for(i=0; i<h; i++){\
776 uint32_t a,b;\
777 a= LD16(&src1[i*src_stride1 ]);\
778 b= LD16(&src2[i*src_stride2 ]);\
779 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
780 }\
781 }\
782 \
783 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
784 int src_stride1, int src_stride2, int h){\
785 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
786 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
787 }\
788 \
789 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
790 int src_stride1, int src_stride2, int h){\
791 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
792 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
793 }\
794 \
795 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
796 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
797 }\
798 \
799 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
800 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
801 }\
802 \
803 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
804 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
805 }\
806 \
807 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
808 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
809 }\
810 \
811 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
812 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
813 int i;\
814 for(i=0; i<h; i++){\
815 uint32_t a, b, c, d, l0, l1, h0, h1;\
816 a= LD32(&src1[i*src_stride1]);\
817 b= LD32(&src2[i*src_stride2]);\
818 c= LD32(&src3[i*src_stride3]);\
819 d= LD32(&src4[i*src_stride4]);\
820 l0= (a&0x03030303UL)\
821 + (b&0x03030303UL)\
822 + 0x02020202UL;\
823 h0= ((a&0xFCFCFCFCUL)>>2)\
824 + ((b&0xFCFCFCFCUL)>>2);\
825 l1= (c&0x03030303UL)\
826 + (d&0x03030303UL);\
827 h1= ((c&0xFCFCFCFCUL)>>2)\
828 + ((d&0xFCFCFCFCUL)>>2);\
829 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
830 a= LD32(&src1[i*src_stride1+4]);\
831 b= LD32(&src2[i*src_stride2+4]);\
832 c= LD32(&src3[i*src_stride3+4]);\
833 d= LD32(&src4[i*src_stride4+4]);\
834 l0= (a&0x03030303UL)\
835 + (b&0x03030303UL)\
836 + 0x02020202UL;\
837 h0= ((a&0xFCFCFCFCUL)>>2)\
838 + ((b&0xFCFCFCFCUL)>>2);\
839 l1= (c&0x03030303UL)\
840 + (d&0x03030303UL);\
841 h1= ((c&0xFCFCFCFCUL)>>2)\
842 + ((d&0xFCFCFCFCUL)>>2);\
843 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
844 }\
845 }\
846 \
847 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
848 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
849 }\
850 \
851 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
852 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
853 }\
854 \
855 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
856 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
857 }\
858 \
859 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
860 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
861 }\
862 \
863 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
864 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
865 int i;\
866 for(i=0; i<h; i++){\
867 uint32_t a, b, c, d, l0, l1, h0, h1;\
868 a= LD32(&src1[i*src_stride1]);\
869 b= LD32(&src2[i*src_stride2]);\
870 c= LD32(&src3[i*src_stride3]);\
871 d= LD32(&src4[i*src_stride4]);\
872 l0= (a&0x03030303UL)\
873 + (b&0x03030303UL)\
874 + 0x01010101UL;\
875 h0= ((a&0xFCFCFCFCUL)>>2)\
876 + ((b&0xFCFCFCFCUL)>>2);\
877 l1= (c&0x03030303UL)\
878 + (d&0x03030303UL);\
879 h1= ((c&0xFCFCFCFCUL)>>2)\
880 + ((d&0xFCFCFCFCUL)>>2);\
881 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
882 a= LD32(&src1[i*src_stride1+4]);\
883 b= LD32(&src2[i*src_stride2+4]);\
884 c= LD32(&src3[i*src_stride3+4]);\
885 d= LD32(&src4[i*src_stride4+4]);\
886 l0= (a&0x03030303UL)\
887 + (b&0x03030303UL)\
888 + 0x01010101UL;\
889 h0= ((a&0xFCFCFCFCUL)>>2)\
890 + ((b&0xFCFCFCFCUL)>>2);\
891 l1= (c&0x03030303UL)\
892 + (d&0x03030303UL);\
893 h1= ((c&0xFCFCFCFCUL)>>2)\
894 + ((d&0xFCFCFCFCUL)>>2);\
895 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
896 }\
897 }\
898 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
899 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
900 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
901 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
902 }\
903 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
904 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
905 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
906 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
907 }\
908 \
909 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
910 {\
911 int i, a0, b0, a1, b1;\
912 a0= pixels[0];\
913 b0= pixels[1] + 2;\
914 a0 += b0;\
915 b0 += pixels[2];\
916 \
917 pixels+=line_size;\
918 for(i=0; i<h; i+=2){\
919 a1= pixels[0];\
920 b1= pixels[1];\
921 a1 += b1;\
922 b1 += pixels[2];\
923 \
924 block[0]= (a1+a0)>>2; /* FIXME non put */\
925 block[1]= (b1+b0)>>2;\
926 \
927 pixels+=line_size;\
928 block +=line_size;\
929 \
930 a0= pixels[0];\
931 b0= pixels[1] + 2;\
932 a0 += b0;\
933 b0 += pixels[2];\
934 \
935 block[0]= (a1+a0)>>2;\
936 block[1]= (b1+b0)>>2;\
937 pixels+=line_size;\
938 block +=line_size;\
939 }\
940 }\
941 \
942 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
943 {\
944 int i;\
945 const uint32_t a= LD32(pixels );\
946 const uint32_t b= LD32(pixels+1);\
947 uint32_t l0= (a&0x03030303UL)\
948 + (b&0x03030303UL)\
949 + 0x02020202UL;\
950 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
951 + ((b&0xFCFCFCFCUL)>>2);\
952 uint32_t l1,h1;\
953 \
954 pixels+=line_size;\
955 for(i=0; i<h; i+=2){\
956 uint32_t a= LD32(pixels );\
957 uint32_t b= LD32(pixels+1);\
958 l1= (a&0x03030303UL)\
959 + (b&0x03030303UL);\
960 h1= ((a&0xFCFCFCFCUL)>>2)\
961 + ((b&0xFCFCFCFCUL)>>2);\
962 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
963 pixels+=line_size;\
964 block +=line_size;\
965 a= LD32(pixels );\
966 b= LD32(pixels+1);\
967 l0= (a&0x03030303UL)\
968 + (b&0x03030303UL)\
969 + 0x02020202UL;\
970 h0= ((a&0xFCFCFCFCUL)>>2)\
971 + ((b&0xFCFCFCFCUL)>>2);\
972 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
973 pixels+=line_size;\
974 block +=line_size;\
975 }\
976 }\
977 \
978 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
979 {\
980 int j;\
981 for(j=0; j<2; j++){\
982 int i;\
983 const uint32_t a= LD32(pixels );\
984 const uint32_t b= LD32(pixels+1);\
985 uint32_t l0= (a&0x03030303UL)\
986 + (b&0x03030303UL)\
987 + 0x02020202UL;\
988 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
989 + ((b&0xFCFCFCFCUL)>>2);\
990 uint32_t l1,h1;\
991 \
992 pixels+=line_size;\
993 for(i=0; i<h; i+=2){\
994 uint32_t a= LD32(pixels );\
995 uint32_t b= LD32(pixels+1);\
996 l1= (a&0x03030303UL)\
997 + (b&0x03030303UL);\
998 h1= ((a&0xFCFCFCFCUL)>>2)\
999 + ((b&0xFCFCFCFCUL)>>2);\
1000 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1001 pixels+=line_size;\
1002 block +=line_size;\
1003 a= LD32(pixels );\
1004 b= LD32(pixels+1);\
1005 l0= (a&0x03030303UL)\
1006 + (b&0x03030303UL)\
1007 + 0x02020202UL;\
1008 h0= ((a&0xFCFCFCFCUL)>>2)\
1009 + ((b&0xFCFCFCFCUL)>>2);\
1010 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1011 pixels+=line_size;\
1012 block +=line_size;\
1013 }\
1014 pixels+=4-line_size*(h+1);\
1015 block +=4-line_size*h;\
1016 }\
1017 }\
1018 \
1019 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1020 {\
1021 int j;\
1022 for(j=0; j<2; j++){\
1023 int i;\
1024 const uint32_t a= LD32(pixels );\
1025 const uint32_t b= LD32(pixels+1);\
1026 uint32_t l0= (a&0x03030303UL)\
1027 + (b&0x03030303UL)\
1028 + 0x01010101UL;\
1029 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1030 + ((b&0xFCFCFCFCUL)>>2);\
1031 uint32_t l1,h1;\
1032 \
1033 pixels+=line_size;\
1034 for(i=0; i<h; i+=2){\
1035 uint32_t a= LD32(pixels );\
1036 uint32_t b= LD32(pixels+1);\
1037 l1= (a&0x03030303UL)\
1038 + (b&0x03030303UL);\
1039 h1= ((a&0xFCFCFCFCUL)>>2)\
1040 + ((b&0xFCFCFCFCUL)>>2);\
1041 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1042 pixels+=line_size;\
1043 block +=line_size;\
1044 a= LD32(pixels );\
1045 b= LD32(pixels+1);\
1046 l0= (a&0x03030303UL)\
1047 + (b&0x03030303UL)\
1048 + 0x01010101UL;\
1049 h0= ((a&0xFCFCFCFCUL)>>2)\
1050 + ((b&0xFCFCFCFCUL)>>2);\
1051 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1052 pixels+=line_size;\
1053 block +=line_size;\
1054 }\
1055 pixels+=4-line_size*(h+1);\
1056 block +=4-line_size*h;\
1057 }\
1058 }\
1059 \
1060 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1061 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1062 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1063 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1064 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1065 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1066 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1067 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1068
1069 #define op_avg(a, b) a = rnd_avg32(a, b)
1070 #endif
1071 #define op_put(a, b) a = b
1072
1073 PIXOP2(avg, op_avg)
1074 PIXOP2(put, op_put)
1075 #undef op_avg
1076 #undef op_put
1077
1078 #define avg2(a,b) ((a+b+1)>>1)
1079 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1080
/* Convenience wrapper: non-rounding average of two 16-wide sources into dst,
 * using the same stride for dst and both sources.  put_no_rnd_pixels16_l2()
 * itself is macro-generated elsewhere in this file. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1084
/* Convenience wrapper: non-rounding average of two 8-wide sources into dst,
 * using the same stride for dst and both sources.  put_no_rnd_pixels8_l2()
 * itself is macro-generated elsewhere in this file. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1088
/*
 * One-point global motion compensation: bilinear interpolation of an
 * 8-pixel-wide, h-row block at the 1/16-pel offset (x16, y16).
 *
 * dst/src  : destination / source pixels, both walked with `stride`
 * x16, y16 : sub-pel position in 1/16ths (0..15 expected; weights sum to 256)
 * rounder  : value added before the >>8 normalisation
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* Bilinear weights of the four neighbouring samples; A+B+C+D == 256. */
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int row;

    for (row = 0; row < h; row++) {
        int col;
        for (col = 0; col < 8; col++)
            dst[col] = (A * src[col]          + B * src[col + 1] +
                        C * src[stride + col] + D * src[stride + col + 1] +
                        rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
1111
/*
 * General global motion compensation for an 8-pixel-wide, h-row block
 * (MPEG-4 style GMC / sprite warping).
 *
 * (ox,oy) is the fixed-point source position of the first sample; it is
 * advanced by (dxy,dyy) per destination row, while (dxx,dyx) advance the
 * position per destination column.  The high 16 bits of vx/vy are taken
 * first, then `shift` further fractional bits are split off — so the
 * coordinates effectively carry 16+shift fractional bits.
 * NOTE(review): exact fixed-point convention of ox/oy assumed from the
 * shifts below — confirm against the callers.
 *
 * r is the rounding constant added before the final >>(shift*2);
 * width/height bound the valid source area (clamped at the edges).
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;   /* sub-pel denominator: frac_x/frac_y are in [0,s) */

    /* turn sizes into the largest valid coordinate for the clamping below */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* split the accumulated position into integer and fractional parts */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* unsigned compare handles negative coordinates as out-of-range too */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: bilinear interpolation of 4 neighbours */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    /* y clamped to the picture edge: interpolate in x only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index  ]*(s-frac_x)
                                          + src[index+1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* x clamped to the picture edge: interpolate in y only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index       ]*(s-frac_y)
                                          + src[index+stride]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* both clamped: nearest edge sample, no interpolation */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1169
/* Full-pel copy for the third-pel MC family: dispatch to the plain
 * put_pixels helper matching the requested block width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        put_pixels2_c (dst, src, stride, height);
    } else if (width == 4) {
        put_pixels4_c (dst, src, stride, height);
    } else if (width == 8) {
        put_pixels8_c (dst, src, stride, height);
    } else if (width == 16) {
        put_pixels16_c(dst, src, stride, height);
    }
}
1178
/* Third-pel MC, horizontal offset 1/3: dst = (2*a + b)/3, approximated
 * with the fixed-point factor 683/2048. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++)
            d[x] = (683 * (2 * s[x] + s[x + 1] + 1)) >> 11;
    }
}
1189
/* Third-pel MC, horizontal offset 2/3: dst = (a + 2*b)/3, approximated
 * with the fixed-point factor 683/2048. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++)
            d[x] = (683 * (s[x] + 2 * s[x + 1] + 1)) >> 11;
    }
}
1200
/* Third-pel MC, vertical offset 1/3: dst = (2*top + bottom)/3, approximated
 * with the fixed-point factor 683/2048. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++)
            d[x] = (683 * (2 * s[x] + s[x + stride] + 1)) >> 11;
    }
}
1211
/* Third-pel MC, offset (1/3, 1/3): weighted 4-tap bilinear blend
 * (4,3,3,2)/12, approximated with the fixed-point factor 2731/32768. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++)
            d[x] = (2731 * (4 * s[x] + 3 * s[x + 1] +
                            3 * s[x + stride] + 2 * s[x + stride + 1] + 6)) >> 15;
    }
}
1222
/* Third-pel MC, offset (1/3, 2/3): weighted 4-tap bilinear blend
 * (3,2,4,3)/12, approximated with the fixed-point factor 2731/32768. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++)
            d[x] = (2731 * (3 * s[x] + 2 * s[x + 1] +
                            4 * s[x + stride] + 3 * s[x + stride + 1] + 6)) >> 15;
    }
}
1233
/* Third-pel MC, vertical offset 2/3: dst = (top + 2*bottom)/3, approximated
 * with the fixed-point factor 683/2048. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++)
            d[x] = (683 * (s[x] + 2 * s[x + stride] + 1)) >> 11;
    }
}
1244
/* Third-pel MC, offset (2/3, 1/3): weighted 4-tap bilinear blend
 * (3,4,2,3)/12, approximated with the fixed-point factor 2731/32768. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++)
            d[x] = (2731 * (3 * s[x] + 4 * s[x + 1] +
                            2 * s[x + stride] + 3 * s[x + stride + 1] + 6)) >> 15;
    }
}
1255
/* Third-pel MC, offset (2/3, 2/3): weighted 4-tap bilinear blend
 * (2,3,3,4)/12, approximated with the fixed-point factor 2731/32768. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t       *d = dst + y * stride;
        for (x = 0; x < width; x++)
            d[x] = (2731 * (2 * s[x] + 3 * s[x + 1] +
                            3 * s[x + stride] + 4 * s[x + stride + 1] + 6)) >> 15;
    }
}
1266
/* Full-pel averaging for the third-pel MC family: dispatch to the plain
 * avg_pixels helper matching the requested block width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        avg_pixels2_c (dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_c (dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_c (dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_c(dst, src, stride, height);
    }
}
1275
/* Averaging third-pel MC, horizontal offset 1/3: rounded average of the
 * existing dst pixel with the (2*a + b)/3 interpolation (683/2048 ~ 1/3). */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int pred = (683 * (2 * src[x] + src[x + 1] + 1)) >> 11;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1286
/* Averaging third-pel MC, horizontal offset 2/3: rounded average of the
 * existing dst pixel with the (a + 2*b)/3 interpolation (683/2048 ~ 1/3). */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int pred = (683 * (src[x] + 2 * src[x + 1] + 1)) >> 11;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1297
/* Averaging third-pel MC, vertical offset 1/3: rounded average of the
 * existing dst pixel with the (2*top + bottom)/3 interpolation. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int pred = (683 * (2 * src[x] + src[x + stride] + 1)) >> 11;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1308
/* Averaging third-pel MC, offset (1/3, 1/3): rounded average of the existing
 * dst pixel with the (4,3,3,2)/12 bilinear blend (2731/32768 ~ 1/12). */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int pred = (2731 * (4 * src[x] + 3 * src[x + 1] +
                                      3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1319
/* Averaging third-pel MC, offset (1/3, 2/3): rounded average of the existing
 * dst pixel with the (3,2,4,3)/12 bilinear blend (2731/32768 ~ 1/12). */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int pred = (2731 * (3 * src[x] + 2 * src[x + 1] +
                                      4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1330
/* Averaging third-pel MC, vertical offset 2/3: rounded average of the
 * existing dst pixel with the (top + 2*bottom)/3 interpolation. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int pred = (683 * (src[x] + 2 * src[x + stride] + 1)) >> 11;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1341
/* Averaging third-pel MC, offset (2/3, 1/3): rounded average of the existing
 * dst pixel with the (3,4,2,3)/12 bilinear blend (2731/32768 ~ 1/12). */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int pred = (2731 * (3 * src[x] + 4 * src[x + 1] +
                                      2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1352
/* Averaging third-pel MC, offset (2/3, 2/3): rounded average of the existing
 * dst pixel with the (2,3,3,4)/12 bilinear blend (2731/32768 ~ 1/12). */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int pred = (2731 * (2 * src[x] + 3 * src[x + 1] +
                                      3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15;
            dst[x] = (dst[x] + pred + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1363 #if 0
1364 #define TPEL_WIDTH(width)\
1365 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1366 void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1367 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1368 void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1369 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1370 void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1371 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1372 void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1373 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1374 void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1375 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1376 void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1377 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1378 void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1379 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1380 void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1381 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1382 void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1383 #endif
1384
/* H.264 chroma motion compensation template for 2-, 4- and 8-wide blocks.
 * (x,y) is the eighth-pel sub-sample position (asserted to be in [0,8));
 * the four bilinear weights A..D sum to 64, and the >>6 normalisation plus
 * rounding is supplied by the OP macro (see op_put/op_avg below). */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    /* bilinear weights of the four neighbouring chroma samples (sum 64) */\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    /* same weights as mc2, block is 4 samples wide */\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    /* same weights as mc2, block is 8 samples wide */\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1447
1448 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1449 #define op_put(a, b) a = (((b) + 32)>>6)
1450
1451 H264_CHROMA_MC(put_ , op_put)
1452 H264_CHROMA_MC(avg_ , op_avg)
1453 #undef op_avg
1454 #undef op_put
1455
/* Copy a 4-pixel-wide block of h rows, one 32-bit load/store per row. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1466
/* Copy an 8-pixel-wide block of h rows, two 32-bit load/stores per row. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
1478
/* Copy a 16-pixel-wide block of h rows, four 32-bit load/stores per row. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst,      LD32(src));
        ST32(dst + 4,  LD32(src + 4));
        ST32(dst + 8,  LD32(src + 8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}
1492
/* Copy a 17-pixel-wide block of h rows (16+1 for qpel edge extension):
 * four 32-bit load/stores plus one trailing byte per row. */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst,      LD32(src));
        ST32(dst + 4,  LD32(src + 4));
        ST32(dst + 8,  LD32(src + 8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1507
/* Copy a 9-pixel-wide block of h rows (8+1 for qpel edge extension):
 * two 32-bit load/stores plus one trailing byte per row. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1520
1521
1522 #define QPEL_MC(r, OPNAME, RND, OP) \
1523 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1524 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1525 int i;\
1526 for(i=0; i<h; i++)\
1527 {\
1528 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1529 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1530 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1531 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1532 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1533 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1534 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1535 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1536 dst+=dstStride;\
1537 src+=srcStride;\
1538 }\
1539 }\
1540 \
1541 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1542 const int w=8;\
1543 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1544 int i;\
1545 for(i=0; i<w; i++)\
1546 {\
1547 const int src0= src[0*srcStride];\
1548 const int src1= src[1*srcStride];\
1549 const int src2= src[2*srcStride];\
1550 const int src3= src[3*srcStride];\
1551 const int src4= src[4*srcStride];\
1552 const int src5= src[5*srcStride];\
1553 const int src6= src[6*srcStride];\
1554 const int src7= src[7*srcStride];\
1555 const int src8= src[8*srcStride];\
1556 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1557 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1558 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1559 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1560 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1561 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1562 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1563 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1564 dst++;\
1565 src++;\
1566 }\
1567 }\
1568 \
1569 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1570 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1571 int i;\
1572 \
1573 for(i=0; i<h; i++)\
1574 {\
1575 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1576 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1577 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1578 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1579 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1580 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1581 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1582 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1583 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1584 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1585 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1586 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1587 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1588 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1589 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1590 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1591 dst+=dstStride;\
1592 src+=srcStride;\
1593 }\
1594 }\
1595 \
1596 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1597 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1598 int i;\
1599 const int w=16;\
1600 for(i=0; i<w; i++)\
1601 {\
1602 const int src0= src[0*srcStride];\
1603 const int src1= src[1*srcStride];\
1604 const int src2= src[2*srcStride];\
1605 const int src3= src[3*srcStride];\
1606 const int src4= src[4*srcStride];\
1607 const int src5= src[5*srcStride];\
1608 const int src6= src[6*srcStride];\
1609 const int src7= src[7*srcStride];\
1610 const int src8= src[8*srcStride];\
1611 const int src9= src[9*srcStride];\
1612 const int src10= src[10*srcStride];\
1613 const int src11= src[11*srcStride];\
1614 const int src12= src[12*srcStride];\
1615 const int src13= src[13*srcStride];\
1616 const int src14= src[14*srcStride];\
1617 const int src15= src[15*srcStride];\
1618 const int src16= src[16*srcStride];\
1619 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1620 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1621 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1622 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1623 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1624 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1625 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1626 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1627 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1628 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1629 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1630 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1631 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1632 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1633 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1634 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1635 dst++;\
1636 src++;\
1637 }\
1638 }\
1639 \
1640 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1641 OPNAME ## pixels8_c(dst, src, stride, 8);\
1642 }\
1643 \
1644 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1645 uint8_t half[64];\
1646 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1647 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1648 }\
1649 \
1650 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1651 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1652 }\
1653 \
1654 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1655 uint8_t half[64];\
1656 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1657 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1658 }\
1659 \
1660 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1661 uint8_t full[16*9];\
1662 uint8_t half[64];\
1663 copy_block9(full, src, 16, stride, 9);\
1664 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1665 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1666 }\
1667 \
1668 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1669 uint8_t full[16*9];\
1670 copy_block9(full, src, 16, stride, 9);\
1671 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1672 }\
1673 \
1674 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1675 uint8_t full[16*9];\
1676 uint8_t half[64];\
1677 copy_block9(full, src, 16, stride, 9);\
1678 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1679 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1680 }\
1681 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1682 uint8_t full[16*9];\
1683 uint8_t halfH[72];\
1684 uint8_t halfV[64];\
1685 uint8_t halfHV[64];\
1686 copy_block9(full, src, 16, stride, 9);\
1687 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1688 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1689 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1690 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1691 }\
1692 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1693 uint8_t full[16*9];\
1694 uint8_t halfH[72];\
1695 uint8_t halfHV[64];\
1696 copy_block9(full, src, 16, stride, 9);\
1697 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1698 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1699 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1700 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1701 }\
1702 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1703 uint8_t full[16*9];\
1704 uint8_t halfH[72];\
1705 uint8_t halfV[64];\
1706 uint8_t halfHV[64];\
1707 copy_block9(full, src, 16, stride, 9);\
1708 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1709 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1710 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1711 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1712 }\
1713 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1714 uint8_t full[16*9];\
1715 uint8_t halfH[72];\
1716 uint8_t halfHV[64];\
1717 copy_block9(full, src, 16, stride, 9);\
1718 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1719 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1720 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1721 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1722 }\
1723 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1724 uint8_t full[16*9];\
1725 uint8_t halfH[72];\
1726 uint8_t halfV[64];\
1727 uint8_t halfHV[64];\
1728 copy_block9(full, src, 16, stride, 9);\
1729 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1730 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1731 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1732 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1733 }\
1734 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1735 uint8_t full[16*9];\
1736 uint8_t halfH[72];\
1737 uint8_t halfHV[64];\
1738 copy_block9(full, src, 16, stride, 9);\
1739 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1740 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1741 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1742 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1743 }\
1744 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1745 uint8_t full[16*9];\
1746 uint8_t halfH[72];\
1747 uint8_t halfV[64];\
1748 uint8_t halfHV[64];\
1749 copy_block9(full, src, 16, stride, 9);\
1750 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1751 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1752 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1753 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1754 }\
1755 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1756 uint8_t full[16*9];\
1757 uint8_t halfH[72];\
1758 uint8_t halfHV[64];\
1759 copy_block9(full, src, 16, stride, 9);\
1760 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1761 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1762 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1763 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1764 }\
1765 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1766 uint8_t halfH[72];\
1767 uint8_t halfHV[64];\
1768 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1769 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1770 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1771 }\
1772 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1773 uint8_t halfH[72];\
1774 uint8_t halfHV[64];\
1775 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1776 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1777 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1778 }\
1779 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1780 uint8_t full[16*9];\
1781 uint8_t halfH[72];\
1782 uint8_t halfV[64];\
1783 uint8_t halfHV[64];\
1784 copy_block9(full, src, 16, stride, 9);\
1785 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1786 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1787 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1788 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1789 }\
1790 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1791 uint8_t full[16*9];\
1792 uint8_t halfH[72];\
1793 copy_block9(full, src, 16, stride, 9);\
1794 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1795 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1796 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1797 }\
1798 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1799 uint8_t full[16*9];\
1800 uint8_t halfH[72];\
1801 uint8_t halfV[64];\
1802 uint8_t halfHV[64];\
1803 copy_block9(full, src, 16, stride, 9);\
1804 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1805 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1806 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1807 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1808 }\
1809 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1810 uint8_t full[16*9];\
1811 uint8_t halfH[72];\
1812 copy_block9(full, src, 16, stride, 9);\
1813 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1814 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1815 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1816 }\
1817 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1818 uint8_t halfH[72];\
1819 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1820 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1821 }\
1822 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1823 OPNAME ## pixels16_c(dst, src, stride, 16);\
1824 }\
1825 \
1826 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1827 uint8_t half[256];\
1828 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1829 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1830 }\
1831 \
1832 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1833 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1834 }\
1835 \
1836 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1837 uint8_t half[256];\
1838 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1839 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1840 }\
1841 \
1842 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1843 uint8_t full[24*17];\
1844 uint8_t half[256];\
1845 copy_block17(full, src, 24, stride, 17);\
1846 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1847 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1848 }\
1849 \
1850 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1851 uint8_t full[24*17];\
1852 copy_block17(full, src, 24, stride, 17);\
1853 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1854 }\
1855 \
1856 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1857 uint8_t full[24*17];\
1858 uint8_t half[256];\
1859 copy_block17(full, src, 24, stride, 17);\
1860 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1861 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1862 }\
1863 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1864 uint8_t full[24*17];\
1865 uint8_t halfH[272];\
1866 uint8_t halfV[256];\
1867 uint8_t halfHV[256];\
1868 copy_block17(full, src, 24, stride, 17);\
1869 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1870 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1871 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1872 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1873 }\
1874 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1875 uint8_t full[24*17];\
1876 uint8_t halfH[272];\
1877 uint8_t halfHV[256];\
1878 copy_block17(full, src, 24, stride, 17);\
1879 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1880 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1881 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1882 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1883 }\
1884 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1885 uint8_t full[24*17];\
1886 uint8_t halfH[272];\
1887 uint8_t halfV[256];\
1888 uint8_t halfHV[256];\
1889 copy_block17(full, src, 24, stride, 17);\
1890 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1891 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1892 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1893 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1894 }\
1895 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1896 uint8_t full[24*17];\
1897 uint8_t halfH[272];\
1898 uint8_t halfHV[256];\
1899 copy_block17(full, src, 24, stride, 17);\
1900 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1901 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1902 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1903 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1904 }\
1905 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1906 uint8_t full[24*17];\
1907 uint8_t halfH[272];\
1908 uint8_t halfV[256];\
1909 uint8_t halfHV[256];\
1910 copy_block17(full, src, 24, stride, 17);\
1911 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1912 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1913 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1914 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1915 }\
1916 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1917 uint8_t full[24*17];\
1918 uint8_t halfH[272];\
1919 uint8_t halfHV[256];\
1920 copy_block17(full, src, 24, stride, 17);\
1921 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1922 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1923 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1924 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1925 }\
1926 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1927 uint8_t full[24*17];\
1928 uint8_t halfH[272];\
1929 uint8_t halfV[256];\
1930 uint8_t halfHV[256];\
1931 copy_block17(full, src, 24, stride, 17);\
1932 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1933 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1934 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1935 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1936 }\
1937 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1938 uint8_t full[24*17];\
1939 uint8_t halfH[272];\
1940 uint8_t halfHV[256];\
1941 copy_block17(full, src, 24, stride, 17);\
1942 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1943 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1944 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1945 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1946 }\
1947 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1948 uint8_t halfH[272];\
1949 uint8_t halfHV[256];\
1950 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1951 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1952 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1953 }\
1954 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1955 uint8_t halfH[272];\
1956 uint8_t halfHV[256];\
1957 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1958 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1959 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1960 }\
1961 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1962 uint8_t full[24*17];\
1963 uint8_t halfH[272];\
1964 uint8_t halfV[256];\
1965 uint8_t halfHV[256];\
1966 copy_block17(full, src, 24, stride, 17);\
1967 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1968 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1969 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1970 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1971 }\
1972 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1973 uint8_t full[24*17];\
1974 uint8_t halfH[272];\
1975 copy_block17(full, src, 24, stride, 17);\
1976 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1977 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1978 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1979 }\
1980 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1981 uint8_t full[24*17];\
1982 uint8_t halfH[272];\
1983 uint8_t halfV[256];\
1984 uint8_t halfHV[256];\
1985 copy_block17(full, src, 24, stride, 17);\
1986 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1987 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1988 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1989 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1990 }\
1991 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1992 uint8_t full[24*17];\
1993 uint8_t halfH[272];\
1994 copy_block17(full, src, 24, stride, 17);\
1995 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1996 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1997 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1998 }\
1999 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2000 uint8_t halfH[272];\
2001 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2002 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2003 }
2004
/* Pixel store operators plugged into QPEL_MC as its OP argument.
 * The filter passes produce values scaled by 32, so every operator
 * normalizes with ">>5" through the clipping table cm:
 *   op_put        - store the rounded (+16), clipped value
 *   op_avg        - average it with the existing destination pixel (rounded up)
 *   *_no_rnd      - bias by 15 instead of 16 so the shift rounds down
 *                   (the "no rounding" MC mode). */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the full set of put/put_no_rnd/avg qpel functions. */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
2018
#if 1
/* Generator for the C reference implementation of the H.264 luma
 * quarter-pel interpolation primitives.  All of them apply the 6-tap
 * (1,-5,20,20,-5,1) filter:
 *   *_h_lowpass  - horizontally, one output row at a time
 *   *_v_lowpass  - vertically, one output column at a time
 *   *_hv_lowpass - horizontally into the int16_t scratch buffer tmp
 *                  (h+5 rows, unclipped intermediates), then vertically
 *                  over tmp using OP2
 * OP stores a value scaled by 32 (single pass), OP2 a value scaled by
 * 1024 (two passes); the concrete operators are defined below the macro.
 * The 16x16 variants are built from four 8x8 calls. */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/* Generator for the 16 H.264 quarter-pel MC positions for one block size.
 * _mcXY_c: X is the horizontal, Y the vertical quarter-pel phase
 * (0 = integer pel, 2 = half pel, 1/3 = quarter pel built by averaging
 * the half-pel result with the nearer full/half-pel sample via
 * pixels##SIZE##_l2).  Intermediates are always computed with the put_
 * variants; OPNAME (put_/avg_) is only applied in the final store.
 * "full" is a copy of the source with 2 extra rows above and 3 below
 * (SIZE+5 rows) so the 6-tap vertical filter has its context;
 * full_mid points at the row aligned with src. */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

/* Store operators for the H264_LOWPASS/H264_MC generators above.
 * op_put/op_avg normalize a single 6-tap pass (scaled by 32: +16 >> 5);
 * op2_put/op2_avg normalize the two-pass hv result (scaled by 1024:
 * +512 >> 10).  The avg variants round-average with the existing
 * destination pixel. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]

H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2379
/* H.264 weighted prediction, C reference.
 * op_scale1: explicit (uni-directional) weighting of one pixel --
 *   block[x] = clip((block[x]*weight + offset) >> log2_denom).
 * op_scale2: bi-directional weighting of src into dst with combined
 *   rounding offset and one extra shift.
 * H264_WEIGHT(W,H) generates weight_/biweight_ functions for one block
 * geometry; W==2/4/8 rows are cut short via the "continue" guards so one
 * macro body covers widths 2..16. */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int attribute_unused x, y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
    int attribute_unused x, y; \
    int offset = (offsets + offsetd + 1) >> 1; \
    offset = ((offset << 1) + 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

/* All block geometries used by H.264 motion partitions. */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2450
2451 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2452 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2453 int i;
2454
2455 for(i=0; i<h; i++){
2456 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2457 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2458 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2459 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2460 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2461 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2462 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2463 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2464 dst+=dstStride;
2465 src+=srcStride;
2466 }
2467 }
2468
2469 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2470 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2471 int i;
2472
2473 for(i=0; i<w; i++){
2474 const int src_1= src[ -srcStride];
2475 const int src0 = src[0 ];
2476 const int src1 = src[ srcStride];
2477 const int src2 = src[2*srcStride];
2478 const int src3 = src[3*srcStride];
2479 const int src4 = src[4*srcStride];
2480 const int src5 = src[5*srcStride];
2481 const int src6 = src[6*srcStride];
2482 const int src7 = src[7*srcStride];
2483 const int src8 = src[8*srcStride];
2484 const int src9 = src[9*srcStride];
2485 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2486 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2487 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2488 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2489 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2490 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2491 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2492 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2493 src++;
2494 dst++;
2495 }
2496 }
2497
/* WMV2 8x8 "mspel" motion compensation wrappers.  The mcXY suffix gives
 * the sub-pel phase (X horizontal, Y vertical): 0 = integer pel,
 * 2 = half pel (lowpass-filtered), 1/3 = average of the integer sample
 * (or its right neighbour) with the filtered half-pel value. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

/* (1,0): average of src and the horizontal half-pel */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

/* (2,0): pure horizontal half-pel */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

/* (3,0): average of src+1 and the horizontal half-pel */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

/* (0,2): pure vertical half-pel */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2521
/* WMV2 mspel position (1,2): blend of the vertically filtered source and the
 * 2-D (horizontal then vertical) filtered block. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];  /* 8 wide x 11 rows: H-filtered block plus extra rows for the V pass */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11); /* start one row above to feed the V filter */
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);         /* V pass over the H result, skipping the lead-in row */
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);     /* average the two candidates */
}
/* WMV2 mspel position (3,2): like (1,2) but the vertical-only candidate is
 * taken from src+1 (other side of the horizontal half-pel position). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];  /* 8 wide x 11 rows: H-filtered block plus extra rows for the V pass */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11); /* start one row above to feed the V filter */
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);         /* V pass over the H result, skipping the lead-in row */
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);     /* average the two candidates */
}
/* WMV2 mspel position (2,2): full 2-D lowpass — horizontal pass into a
 * scratch buffer, then vertical pass straight into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];  /* 8 wide x 11 rows: H-filtered block plus extra rows for the V pass */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2545
/**
 * H.263 deblocking filter for a horizontal block edge (vertical filtering).
 * For each of the 8 columns, adjusts the 4 pixels straddling the edge:
 * p0,p1 above it and p2,p3 below it.
 * @param src pointer to the first row below the edge
 * @param stride distance in bytes between rows
 * @param qscale quantizer; indexes the per-qscale filter strength table
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;  /* edge gradient estimate */

        /* piecewise-linear response: pass small |d| through unchanged,
           ramp the correction back down to 0 as |d| approaches 2*strength
           (large steps are assumed to be real edges, not blocking) */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* clamp to 0..255: p1/p2 stay within -256..511 here, so bit 8 set
           means out of range; ~(x>>31) yields 0 for negative values and
           -1 (stored as 255) for overflows */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        /* secondary, weaker correction of the outer pixels, limited to
           half the magnitude of the inner correction */
        ad1= ABS(d1)>>1;

        d2= clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
}
2580
/**
 * H.263 deblocking filter for a vertical block edge (horizontal filtering).
 * Mirror of h263_v_loop_filter_c: for each of the 8 rows, adjusts the 4
 * pixels straddling the edge: p0,p1 to its left and p2,p3 to its right.
 * @param src pointer to the first column right of the edge
 * @param stride distance in bytes between rows
 * @param qscale quantizer; indexes the per-qscale filter strength table
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;  /* edge gradient estimate */

        /* piecewise-linear response: full correction for small |d|,
           ramping down to 0 for |d| >= 2*strength (keep real edges) */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* clamp to 0..255 (values are within -256..511): 0 if negative,
           -1 (stored as 255) if >= 256 */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        /* weaker correction of the outer pixels, capped at half |d1| */
        ad1= ABS(d1)>>1;

        d2= clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
}
2615
/**
 * H.261 loop filter: separable [1 2 1] smoothing of an 8x8 block.
 * First pass stores 4x-scaled, vertically filtered rows in temp (rows 0 and
 * 7 are only scaled, not filtered); second pass applies the horizontal
 * [1 2 1] with a final >>4 renormalization. Columns 0 and 7 skip the
 * horizontal tap and only renormalize (+2 >> 2), so border pixels receive
 * at most the one applicable 1-D pass.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    /* top and bottom rows: scale by 4 only (no vertical neighbors used) */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    /* interior rows: vertical [1 2 1] (weight sum 4, matching the scaling) */
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        /* edge columns: renormalize only */
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        /* interior columns: horizontal [1 2 1] over the scaled rows, total
           weight 16, hence the +8 rounding and >>4 */
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
2642
/**
 * H.264 normal-strength luma deblocking for one 16-sample edge.
 * Processes 4 groups of 4 lines; tc0[i] < 0 disables group i.
 * xstride steps across the edge and ystride along it, so the same code
 * handles both vertical and horizontal edges via the two wrappers below.
 * @param alpha threshold on the step across the edge
 * @param beta threshold on the gradients on either side of the edge
 * @param tc0 per-group clipping values (4 entries)
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;  /* group disabled, skip its 4 lines */
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            /* three samples on each side of the edge: p2 p1 p0 | q0 q1 q2 */
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            /* filter only where the discontinuity looks like a blocking
               artifact rather than a real edge */
            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* flat inner p side: also adjust p1, clipped to +-tc0,
                   and widen the clip range for the p0/q0 delta */
                if( ABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0[i], tc0[i] );
                    tc++;
                }
                /* same for the q side */
                if( ABS( q2 - q0 ) < beta ) {
                    pix[xstride] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0[i], tc0[i] );
                    tc++;
                }

                /* symmetric correction of the two pixels nearest the edge */
                i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Luma deblocking of a horizontal edge: step across the edge by stride
 * (between rows), along it by 1. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Luma deblocking of a vertical edge: step across the edge by 1 (between
 * columns), along it by stride. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2691
/**
 * H.264 normal-strength chroma deblocking for one 8-sample edge.
 * Like the luma filter but with 4 groups of only 2 lines each, and only the
 * two pixels adjacent to the edge (p0/q0) are ever modified.
 * @param alpha threshold on the step across the edge
 * @param beta threshold on the gradients on either side of the edge
 * @param tc0 per-group clipping values (4 entries); tc <= 0 disables a group
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;  /* group disabled, skip its 2 lines */
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            /* two samples on each side of the edge: p1 p0 | q0 q1 */
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            /* only filter artifact-like discontinuities */
            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Chroma deblocking of a horizontal edge: across the edge by stride,
 * along it by 1. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Chroma deblocking of a vertical edge: across the edge by 1, along it
 * by stride. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
2728
2729 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2730 {
2731 int s, i;
2732
2733 s = 0;
2734 for(i=0;i<h;i++) {
2735 s += abs(pix1[0] - pix2[0]);
2736 s += abs(pix1[1] - pix2[1]);
2737 s += abs(pix1[2] - pix2[2]);
2738 s += abs(pix1[3] - pix2[3]);
2739 s += abs(pix1[4] - pix2[4]);
2740 s += abs(pix1[5] - pix2[5]);
2741 s += abs(pix1[6] - pix2[6]);
2742 s += abs(pix1[7] - pix2[7]);
2743 s += abs(pix1[8] - pix2[8]);
2744 s += abs(pix1[9] - pix2[9]);
2745 s += abs(pix1[10] - pix2[10]);
2746 s += abs(pix1[11] - pix2[11]);
2747 s += abs(pix1[12] - pix2[12]);
2748 s += abs(pix1[13] - pix2[13]);
2749 s += abs(pix1[14] - pix2[14]);
2750 s += abs(pix1[15] - pix2[15]);
2751 pix1 += line_size;
2752 pix2 += line_size;
2753 }
2754 return s;
2755 }
2756
/**
 * 16xh SAD against the horizontal half-pel interpolation of pix2
 * (each reference sample is the rounded average of two horizontal
 * neighbors, so pix2 is read 17 pixels wide).
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2784
/**
 * 16xh SAD against the vertical half-pel interpolation of pix2
 * (each reference sample is the rounded average of two vertically
 * adjacent pixels, so pix2 is read h+1 rows tall).
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;  /* row below pix2 */
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix3[col]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2814
/**
 * 16xh SAD against the diagonal half-pel interpolation of pix2
 * (each reference sample is the rounded average of a 2x2 neighborhood,
 * so pix2 is read 17 pixels wide and h+1 rows tall).
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;  /* row below pix2 */
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], pix3[col], pix3[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2844
/**
 * Sum of absolute differences between two 8-pixel-wide blocks (8xh SAD).
 * @param v unused context pointer (kept for the me_cmp function signature)
 * @return accumulated SAD
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2864
/**
 * 8xh SAD against the horizontal half-pel interpolation of pix2
 * (pix2 is read 9 pixels wide).
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2884
/**
 * 8xh SAD against the vertical half-pel interpolation of pix2
 * (pix2 is read h+1 rows tall).
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;  /* row below pix2 */
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix3[col]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2906
/**
 * 8xh SAD against the diagonal half-pel interpolation of pix2
 * (pix2 is read 9 pixels wide and h+1 rows tall).
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;  /* row below pix2 */
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], pix3[col], pix3[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2928
2929 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2930 int score1=0;
2931 int score2=0;
2932 int x,y;
2933
2934 for(y=0; y<h; y++){
2935 for(x=0; x<16; x++){
2936 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2937 }
2938 if(y+1<h){
2939 for(x=0; x<15; x++){
2940 score2+= ABS( s1[x ] - s1[x +stride]
2941 - s1[x+1] + s1[x+1+stride])
2942 -ABS( s2[x ] - s2[x +stride]
2943 - s2[x+1] + s2[x+1+stride]);
2944 }
2945 }
2946 s1+= stride;
2947 s2+= stride;
2948 }
2949
2950 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2951 else return score1 + ABS(score2)*8;
2952 }
2953
2954 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2955 int score1=0;
2956 int score2=0;
2957 int x,y;
2958
2959 for(y=0; y<h; y++){
2960 for(x=0; x<8; x++){
2961 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2962 }
2963 if(y+1<h){
2964 for(x=0; x<7; x++){
2965 score2+= ABS( s1[x ] - s1[x +stride]
2966 - s1[x+1] + s1[x+1+stride])
2967 -ABS( s2[x ] - s2[x +stride]
2968 - s2[x+1] + s2[x+1+stride]);
2969 }
2970 }
2971 s1+= stride;
2972 s2+= stride;
2973 }
2974
2975 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2976 else return score1 + ABS(score2)*8;
2977 }
2978
2979 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2980 int i;
2981 unsigned int sum=0;
2982
2983 for(i=0; i<8*8; i++){
2984 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2985 int w= weight[i];
2986 b>>= RECON_SHIFT;
2987 assert(-512<b && b<512);
2988
2989 sum += (w*b)*(w*b)>>4;
2990 }
2991 return sum>>2;
2992 }
2993
2994 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2995 int i;
2996
2997 for(i=0; i<8*8; i++){
2998 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2999 }
3000 }
3001
3002 /**
3003 * permutes an 8x8 block.
3004 * @param block the block which will be permuted according to the given permutation vector
3005 * @param permutation the permutation vector
3006 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3007 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3008 * (inverse) permutated to scantable order!
3009 */
3010 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3011 {
3012 int i;
3013 DCTELEM temp[64];
3014
3015 if(last<=0) return;
3016 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3017
3018 for(i=0; i<=last; i++){
3019 const int j= scantable[i];
3020 temp[j]= block[j];
3021 block[j]=0;
3022 }
3023
3024 for(i=0; i<=last; i++){
3025 const int j= scantable[i];
3026 const int perm_j= permutation[j];
3027 block[perm_j]= temp[j];
3028 }
3029 }
3030
/* Dummy compare function: always reports a perfect match (score 0). */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
3034
3035 void ff_set_cmp(DSPContext* c, me_cmp_f