16daa01e4508218fbde56a7f0715c5ba402a3cff
[libav.git] / libavcodec / dsputil.c
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21 */
22
23 /**
24 * @file dsputil.c
25 * DSP utils
26 */
27
28 #include "avcodec.h"
29 #include "dsputil.h"
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
32 #include "faandct.h"
33
/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* Clipping LUT: indexed with an offset of MAX_NEG_CROP so that moderately
   out-of-range values clamp to 0/255; zeroed here, presumably filled by the
   dsputil init code (not visible in this chunk). */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* squareTbl[256+i] is used as i*i for -256<=i<256 (see pix_norm1_c/sse*_c);
   presumably filled by the dsputil init code — not visible here. */
uint32_t squareTbl[512] = {0, };
39
/* default zigzag scan order: maps scan position to raster (row-major)
   coefficient index of an 8x8 block */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
50
/* Specific zigzag scan for the 2-4-8 idct. NOTE that unlike the
   specification, we interleave the fields here. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
63
/* not permutated inverse zigzag_direct + 1 for MMX quantizer;
   zeroed here, presumably filled at init time (not visible in this chunk) */
uint16_t __align8 inv_zigzag_direct16[64] = {0, };
66
/* alternate (horizontal-first) coefficient scan order for an 8x8 block */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
77
/* alternate (vertical-first) coefficient scan order for an 8x8 block */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
88
/* Reciprocal table for division by multiplication:
   a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255.
   inverse[b] is ceil(2^32 / b); entries 0 and 1 are edge values. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
124
/* Input permutation for the simple_idct_mmx: maps natural coefficient
   order to the layout simple_idct_mmx expects (exact meaning of the
   encoding is defined by that implementation — see simple_idct). */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
136
/**
 * Sum all 256 pixel values of a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size byte stride between rows
 * @return sum of the 16x16 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
158
159 static int pix_norm1_c(uint8_t * pix, int line_size)
160 {
161 int s, i, j;
162 uint32_t *sq = squareTbl + 256;
163
164 s = 0;
165 for (i = 0; i < 16; i++) {
166 for (j = 0; j < 16; j += 8) {
167 #if 0
168 s += sq[pix[0]];
169 s += sq[pix[1]];
170 s += sq[pix[2]];
171 s += sq[pix[3]];
172 s += sq[pix[4]];
173 s += sq[pix[5]];
174 s += sq[pix[6]];
175 s += sq[pix[7]];
176 #else
177 #if LONG_MAX > 2147483647
178 register uint64_t x=*(uint64_t*)pix;
179 s += sq[x&0xff];
180 s += sq[(x>>8)&0xff];
181 s += sq[(x>>16)&0xff];
182 s += sq[(x>>24)&0xff];
183 s += sq[(x>>32)&0xff];
184 s += sq[(x>>40)&0xff];
185 s += sq[(x>>48)&0xff];
186 s += sq[(x>>56)&0xff];
187 #else
188 register uint32_t x=*(uint32_t*)pix;
189 s += sq[x&0xff];
190 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff];
193 x=*(uint32_t*)(pix+4);
194 s += sq[x&0xff];
195 s += sq[(x>>8)&0xff];
196 s += sq[(x>>16)&0xff];
197 s += sq[(x>>24)&0xff];
198 #endif
199 #endif
200 pix += 8;
201 }
202 pix += line_size - 16;
203 }
204 return s;
205 }
206
/**
 * Byte-swap each 32-bit word of a buffer.
 * @param dst destination word array
 * @param src source word array
 * @param w   number of 32-bit words
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int remaining = w;

    /* unrolled main loop: eight words per pass */
    while (remaining >= 8) {
        dst[0] = bswap_32(src[0]);
        dst[1] = bswap_32(src[1]);
        dst[2] = bswap_32(src[2]);
        dst[3] = bswap_32(src[3]);
        dst[4] = bswap_32(src[4]);
        dst[5] = bswap_32(src[5]);
        dst[6] = bswap_32(src[6]);
        dst[7] = bswap_32(src[7]);
        dst += 8;
        src += 8;
        remaining -= 8;
    }
    /* leftover tail, one word at a time */
    while (remaining-- > 0)
        *dst++ = bswap_32(*src++);
}
224
225 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
226 {
227 int s, i;
228 uint32_t *sq = squareTbl + 256;
229
230 s = 0;
231 for (i = 0; i < h; i++) {
232 s += sq[pix1[0] - pix2[0]];
233 s += sq[pix1[1] - pix2[1]];
234 s += sq[pix1[2] - pix2[2]];
235 s += sq[pix1[3] - pix2[3]];
236 pix1 += line_size;
237 pix2 += line_size;
238 }
239 return s;
240 }
241
242 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
243 {
244 int s, i;
245 uint32_t *sq = squareTbl + 256;
246
247 s = 0;
248 for (i = 0; i < h; i++) {
249 s += sq[pix1[0] - pix2[0]];
250 s += sq[pix1[1] - pix2[1]];
251 s += sq[pix1[2] - pix2[2]];
252 s += sq[pix1[3] - pix2[3]];
253 s += sq[pix1[4] - pix2[4]];
254 s += sq[pix1[5] - pix2[5]];
255 s += sq[pix1[6] - pix2[6]];
256 s += sq[pix1[7] - pix2[7]];
257 pix1 += line_size;
258 pix2 += line_size;
259 }
260 return s;
261 }
262
263 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
264 {
265 int s, i;
266 uint32_t *sq = squareTbl + 256;
267
268 s = 0;
269 for (i = 0; i < h; i++) {
270 s += sq[pix1[ 0] - pix2[ 0]];
271 s += sq[pix1[ 1] - pix2[ 1]];
272 s += sq[pix1[ 2] - pix2[ 2]];
273 s += sq[pix1[ 3] - pix2[ 3]];
274 s += sq[pix1[ 4] - pix2[ 4]];
275 s += sq[pix1[ 5] - pix2[ 5]];
276 s += sq[pix1[ 6] - pix2[ 6]];
277 s += sq[pix1[ 7] - pix2[ 7]];
278 s += sq[pix1[ 8] - pix2[ 8]];
279 s += sq[pix1[ 9] - pix2[ 9]];
280 s += sq[pix1[10] - pix2[10]];
281 s += sq[pix1[11] - pix2[11]];
282 s += sq[pix1[12] - pix2[12]];
283 s += sq[pix1[13] - pix2[13]];
284 s += sq[pix1[14] - pix2[14]];
285 s += sq[pix1[15] - pix2[15]];
286
287 pix1 += line_size;
288 pix2 += line_size;
289 }
290 return s;
291 }
292
293
/**
 * Wavelet-domain distortion metric: builds the pixel difference of a
 * w x h block (w is 8 or 16), runs ff_spatial_dwt() on it in place and
 * sums the absolute values of the transform coefficients.
 * @param type wavelet type forwarded to ff_spatial_dwt() — presumably
 *             1 = 5/3 and 0 = 9/7, judging by the scale[] comments
 *             below; confirm against snow.c
 * @return accumulated absolute coefficient sum, >>2
 *
 * NOTE(review): when CONFIG_SNOW_ENCODER is not defined the whole body
 * is compiled out and control falls off the end of a non-void function;
 * callers must be unreachable in that configuration — verify.
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
#ifdef CONFIG_SNOW_ENCODER //idwt is in snow.c
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;   /* decomposition levels: 3 for 8x8, 4 for 16x16 */
    int tmp[16*16];                      /* difference block, 16-wide regardless of w */
#if 0
    /* disabled per-subband weighting variant, kept for reference */
    int level, ori;
    static const int scale[2][2][4][4]={
    {
    {
    //8x8 dec=3
    {268, 239, 239, 213},
    { 0, 224, 224, 152},
    { 0, 135, 135, 110},
    },{
    //16x16 dec=4
    {344, 310, 310, 280},
    { 0, 320, 320, 228},
    { 0, 175, 175, 136},
    { 0, 129, 129, 102},
    }
    },{
    {//FIXME 5/3
    //8x8 dec=3
    {275, 245, 245, 218},
    { 0, 230, 230, 156},
    { 0, 138, 138, 113},
    },{
    //16x16 dec=4
    {352, 317, 317, 286},
    { 0, 328, 328, 233},
    { 0, 180, 180, 140},
    { 0, 132, 132, 105},
    }
    }
    };
#endif

    /* fill tmp with the pixel differences, scaled up by 16 (<<4) */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    /* in-place wavelet transform of the difference block (stride 16) */
    ff_spatial_dwt(tmp, w, h, 16, type, dec_count);

    s=0;
#if 0
    /* disabled: per-subband weighted accumulation matching scale[] above */
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int sx= (ori&1) ? 1<<level: 0;
            int stride= 16<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;
            int size= 1<<level;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += ABS(v);
                }
            }
        }
    }
#endif
    /* unweighted sum of absolute transform coefficients */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            s+= ABS(tmp[16*i+j+0]);
            s+= ABS(tmp[16*i+j+1]);
            s+= ABS(tmp[16*i+j+2]);
            s+= ABS(tmp[16*i+j+3]);
        }
    }
    assert(s>=0);

    return s>>2;
#endif
}
376
/* wavelet distortion score, 8-pixel-wide block, type 1 (presumably the
   5/3 wavelet — confirm against snow.c); thin wrapper around w_c() */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}
380
/* wavelet distortion score, 8-pixel-wide block, type 0 (presumably the
   9/7 wavelet — confirm against snow.c); thin wrapper around w_c() */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}
384
/* wavelet distortion score, 16-pixel-wide block, type 1 (presumably the
   5/3 wavelet — confirm against snow.c); thin wrapper around w_c() */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
388
/* wavelet distortion score, 16-pixel-wide block, type 0 (presumably the
   9/7 wavelet — confirm against snow.c); thin wrapper around w_c() */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
392
393 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
394 {
395 int i;
396
397 /* read the pixels */
398 for(i=0;i<8;i++) {
399 block[0] = pixels[0];
400 block[1] = pixels[1];
401 block[2] = pixels[2];
402 block[3] = pixels[3];
403 block[4] = pixels[4];
404 block[5] = pixels[5];
405 block[6] = pixels[6];
406 block[7] = pixels[7];
407 pixels += line_size;
408 block += 8;
409 }
410 }
411
412 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
413 const uint8_t *s2, int stride){
414 int i;
415
416 /* read the pixels */
417 for(i=0;i<8;i++) {
418 block[0] = s1[0] - s2[0];
419 block[1] = s1[1] - s2[1];
420 block[2] = s1[2] - s2[2];
421 block[3] = s1[3] - s2[3];
422 block[4] = s1[4] - s2[4];
423 block[5] = s1[5] - s2[5];
424 block[6] = s1[6] - s2[6];
425 block[7] = s1[7] - s2[7];
426 s1 += stride;
427 s2 += stride;
428 block += 8;
429 }
430 }
431
432
433 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
434 int line_size)
435 {
436 int i;
437 uint8_t *cm = cropTbl + MAX_NEG_CROP;
438
439 /* read the pixels */
440 for(i=0;i<8;i++) {
441 pixels[0] = cm[block[0]];
442 pixels[1] = cm[block[1]];
443 pixels[2] = cm[block[2]];
444 pixels[3] = cm[block[3]];
445 pixels[4] = cm[block[4]];
446 pixels[5] = cm[block[5]];
447 pixels[6] = cm[block[6]];
448 pixels[7] = cm[block[7]];
449
450 pixels += line_size;
451 block += 8;
452 }
453 }
454
455 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
456 int line_size)
457 {
458 int i;
459 uint8_t *cm = cropTbl + MAX_NEG_CROP;
460
461 /* read the pixels */
462 for(i=0;i<4;i++) {
463 pixels[0] = cm[block[0]];
464 pixels[1] = cm[block[1]];
465 pixels[2] = cm[block[2]];
466 pixels[3] = cm[block[3]];
467
468 pixels += line_size;
469 block += 8;
470 }
471 }
472
473 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
474 int line_size)
475 {
476 int i;
477 uint8_t *cm = cropTbl + MAX_NEG_CROP;
478
479 /* read the pixels */
480 for(i=0;i<2;i++) {
481 pixels[0] = cm[block[0]];
482 pixels[1] = cm[block[1]];
483
484 pixels += line_size;
485 block += 8;
486 }
487 }
488
489 static void put_signed_pixels_clamped_c(const DCTELEM *block,
490 uint8_t *restrict pixels,
491 int line_size)
492 {
493 int i, j;
494
495 for (i = 0; i < 8; i++) {
496 for (j = 0; j < 8; j++) {
497 if (*block < -128)
498 *pixels = 0;
499 else if (*block > 127)
500 *pixels = 255;
501 else
502 *pixels = (uint8_t)(*block + 128);
503 block++;
504 pixels++;
505 }
506 pixels += (line_size - 8);
507 }
508 }
509
510 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
511 int line_size)
512 {
513 int i;
514 uint8_t *cm = cropTbl + MAX_NEG_CROP;
515
516 /* read the pixels */
517 for(i=0;i<8;i++) {
518 pixels[0] = cm[pixels[0] + block[0]];
519 pixels[1] = cm[pixels[1] + block[1]];
520 pixels[2] = cm[pixels[2] + block[2]];
521 pixels[3] = cm[pixels[3] + block[3]];
522 pixels[4] = cm[pixels[4] + block[4]];
523 pixels[5] = cm[pixels[5] + block[5]];
524 pixels[6] = cm[pixels[6] + block[6]];
525 pixels[7] = cm[pixels[7] + block[7]];
526 pixels += line_size;
527 block += 8;
528 }
529 }
530
531 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
532 int line_size)
533 {
534 int i;
535 uint8_t *cm = cropTbl + MAX_NEG_CROP;
536
537 /* read the pixels */
538 for(i=0;i<4;i++) {
539 pixels[0] = cm[pixels[0] + block[0]];
540 pixels[1] = cm[pixels[1] + block[1]];
541 pixels[2] = cm[pixels[2] + block[2]];
542 pixels[3] = cm[pixels[3] + block[3]];
543 pixels += line_size;
544 block += 8;
545 }
546 }
547
548 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
549 int line_size)
550 {
551 int i;
552 uint8_t *cm = cropTbl + MAX_NEG_CROP;
553
554 /* read the pixels */
555 for(i=0;i<2;i++) {
556 pixels[0] = cm[pixels[0] + block[0]];
557 pixels[1] = cm[pixels[1] + block[1]];
558 pixels += line_size;
559 block += 8;
560 }
561 }
562 #if 0
563
564 #define PIXOP2(OPNAME, OP) \
565 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
566 {\
567 int i;\
568 for(i=0; i<h; i++){\
569 OP(*((uint64_t*)block), LD64(pixels));\
570 pixels+=line_size;\
571 block +=line_size;\
572 }\
573 }\
574 \
575 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
576 {\
577 int i;\
578 for(i=0; i<h; i++){\
579 const uint64_t a= LD64(pixels );\
580 const uint64_t b= LD64(pixels+1);\
581 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
582 pixels+=line_size;\
583 block +=line_size;\
584 }\
585 }\
586 \
587 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
588 {\
589 int i;\
590 for(i=0; i<h; i++){\
591 const uint64_t a= LD64(pixels );\
592 const uint64_t b= LD64(pixels+1);\
593 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
594 pixels+=line_size;\
595 block +=line_size;\
596 }\
597 }\
598 \
599 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
600 {\
601 int i;\
602 for(i=0; i<h; i++){\
603 const uint64_t a= LD64(pixels );\
604 const uint64_t b= LD64(pixels+line_size);\
605 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
606 pixels+=line_size;\
607 block +=line_size;\
608 }\
609 }\
610 \
611 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
612 {\
613 int i;\
614 for(i=0; i<h; i++){\
615 const uint64_t a= LD64(pixels );\
616 const uint64_t b= LD64(pixels+line_size);\
617 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
618 pixels+=line_size;\
619 block +=line_size;\
620 }\
621 }\
622 \
623 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
624 {\
625 int i;\
626 const uint64_t a= LD64(pixels );\
627 const uint64_t b= LD64(pixels+1);\
628 uint64_t l0= (a&0x0303030303030303ULL)\
629 + (b&0x0303030303030303ULL)\
630 + 0x0202020202020202ULL;\
631 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
632 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
633 uint64_t l1,h1;\
634 \
635 pixels+=line_size;\
636 for(i=0; i<h; i+=2){\
637 uint64_t a= LD64(pixels );\
638 uint64_t b= LD64(pixels+1);\
639 l1= (a&0x0303030303030303ULL)\
640 + (b&0x0303030303030303ULL);\
641 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
642 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
643 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
644 pixels+=line_size;\
645 block +=line_size;\
646 a= LD64(pixels );\
647 b= LD64(pixels+1);\
648 l0= (a&0x0303030303030303ULL)\
649 + (b&0x0303030303030303ULL)\
650 + 0x0202020202020202ULL;\
651 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
652 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
653 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
654 pixels+=line_size;\
655 block +=line_size;\
656 }\
657 }\
658 \
659 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
660 {\
661 int i;\
662 const uint64_t a= LD64(pixels );\
663 const uint64_t b= LD64(pixels+1);\
664 uint64_t l0= (a&0x0303030303030303ULL)\
665 + (b&0x0303030303030303ULL)\
666 + 0x0101010101010101ULL;\
667 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
668 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
669 uint64_t l1,h1;\
670 \
671 pixels+=line_size;\
672 for(i=0; i<h; i+=2){\
673 uint64_t a= LD64(pixels );\
674 uint64_t b= LD64(pixels+1);\
675 l1= (a&0x0303030303030303ULL)\
676 + (b&0x0303030303030303ULL);\
677 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
678 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
679 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
680 pixels+=line_size;\
681 block +=line_size;\
682 a= LD64(pixels );\
683 b= LD64(pixels+1);\
684 l0= (a&0x0303030303030303ULL)\
685 + (b&0x0303030303030303ULL)\
686 + 0x0101010101010101ULL;\
687 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
688 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
689 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
690 pixels+=line_size;\
691 block +=line_size;\
692 }\
693 }\
694 \
695 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
696 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
697 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
698 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
699 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
700 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
701 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
702
703 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
704 #else // 64 bit variant
705
706 #define PIXOP2(OPNAME, OP) \
707 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
708 int i;\
709 for(i=0; i<h; i++){\
710 OP(*((uint16_t*)(block )), LD16(pixels ));\
711 pixels+=line_size;\
712 block +=line_size;\
713 }\
714 }\
715 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
716 int i;\
717 for(i=0; i<h; i++){\
718 OP(*((uint32_t*)(block )), LD32(pixels ));\
719 pixels+=line_size;\
720 block +=line_size;\
721 }\
722 }\
723 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
724 int i;\
725 for(i=0; i<h; i++){\
726 OP(*((uint32_t*)(block )), LD32(pixels ));\
727 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
728 pixels+=line_size;\
729 block +=line_size;\
730 }\
731 }\
732 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
733 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
734 }\
735 \
736 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
737 int src_stride1, int src_stride2, int h){\
738 int i;\
739 for(i=0; i<h; i++){\
740 uint32_t a,b;\
741 a= LD32(&src1[i*src_stride1 ]);\
742 b= LD32(&src2[i*src_stride2 ]);\
743 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
744 a= LD32(&src1[i*src_stride1+4]);\
745 b= LD32(&src2[i*src_stride2+4]);\
746 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
747 }\
748 }\
749 \
750 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
751 int src_stride1, int src_stride2, int h){\
752 int i;\
753 for(i=0; i<h; i++){\
754 uint32_t a,b;\
755 a= LD32(&src1[i*src_stride1 ]);\
756 b= LD32(&src2[i*src_stride2 ]);\
757 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
758 a= LD32(&src1[i*src_stride1+4]);\
759 b= LD32(&src2[i*src_stride2+4]);\
760 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
761 }\
762 }\
763 \
764 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
765 int src_stride1, int src_stride2, int h){\
766 int i;\
767 for(i=0; i<h; i++){\
768 uint32_t a,b;\
769 a= LD32(&src1[i*src_stride1 ]);\
770 b= LD32(&src2[i*src_stride2 ]);\
771 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
772 }\
773 }\
774 \
775 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
776 int src_stride1, int src_stride2, int h){\
777 int i;\
778 for(i=0; i<h; i++){\
779 uint32_t a,b;\
780 a= LD16(&src1[i*src_stride1 ]);\
781 b= LD16(&src2[i*src_stride2 ]);\
782 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
783 }\
784 }\
785 \
786 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
787 int src_stride1, int src_stride2, int h){\
788 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
789 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
790 }\
791 \
792 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
793 int src_stride1, int src_stride2, int h){\
794 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
795 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
796 }\
797 \
798 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
799 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
800 }\
801 \
802 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
803 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
804 }\
805 \
806 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
807 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
808 }\
809 \
810 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
811 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
812 }\
813 \
814 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
815 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
816 int i;\
817 for(i=0; i<h; i++){\
818 uint32_t a, b, c, d, l0, l1, h0, h1;\
819 a= LD32(&src1[i*src_stride1]);\
820 b= LD32(&src2[i*src_stride2]);\
821 c= LD32(&src3[i*src_stride3]);\
822 d= LD32(&src4[i*src_stride4]);\
823 l0= (a&0x03030303UL)\
824 + (b&0x03030303UL)\
825 + 0x02020202UL;\
826 h0= ((a&0xFCFCFCFCUL)>>2)\
827 + ((b&0xFCFCFCFCUL)>>2);\
828 l1= (c&0x03030303UL)\
829 + (d&0x03030303UL);\
830 h1= ((c&0xFCFCFCFCUL)>>2)\
831 + ((d&0xFCFCFCFCUL)>>2);\
832 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
833 a= LD32(&src1[i*src_stride1+4]);\
834 b= LD32(&src2[i*src_stride2+4]);\
835 c= LD32(&src3[i*src_stride3+4]);\
836 d= LD32(&src4[i*src_stride4+4]);\
837 l0= (a&0x03030303UL)\
838 + (b&0x03030303UL)\
839 + 0x02020202UL;\
840 h0= ((a&0xFCFCFCFCUL)>>2)\
841 + ((b&0xFCFCFCFCUL)>>2);\
842 l1= (c&0x03030303UL)\
843 + (d&0x03030303UL);\
844 h1= ((c&0xFCFCFCFCUL)>>2)\
845 + ((d&0xFCFCFCFCUL)>>2);\
846 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
847 }\
848 }\
849 \
850 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
851 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
852 }\
853 \
854 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
855 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
856 }\
857 \
858 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
859 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
860 }\
861 \
862 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
863 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
864 }\
865 \
866 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
867 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
868 int i;\
869 for(i=0; i<h; i++){\
870 uint32_t a, b, c, d, l0, l1, h0, h1;\
871 a= LD32(&src1[i*src_stride1]);\
872 b= LD32(&src2[i*src_stride2]);\
873 c= LD32(&src3[i*src_stride3]);\
874 d= LD32(&src4[i*src_stride4]);\
875 l0= (a&0x03030303UL)\
876 + (b&0x03030303UL)\
877 + 0x01010101UL;\
878 h0= ((a&0xFCFCFCFCUL)>>2)\
879 + ((b&0xFCFCFCFCUL)>>2);\
880 l1= (c&0x03030303UL)\
881 + (d&0x03030303UL);\
882 h1= ((c&0xFCFCFCFCUL)>>2)\
883 + ((d&0xFCFCFCFCUL)>>2);\
884 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
885 a= LD32(&src1[i*src_stride1+4]);\
886 b= LD32(&src2[i*src_stride2+4]);\
887 c= LD32(&src3[i*src_stride3+4]);\
888 d= LD32(&src4[i*src_stride4+4]);\
889 l0= (a&0x03030303UL)\
890 + (b&0x03030303UL)\
891 + 0x01010101UL;\
892 h0= ((a&0xFCFCFCFCUL)>>2)\
893 + ((b&0xFCFCFCFCUL)>>2);\
894 l1= (c&0x03030303UL)\
895 + (d&0x03030303UL);\
896 h1= ((c&0xFCFCFCFCUL)>>2)\
897 + ((d&0xFCFCFCFCUL)>>2);\
898 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
899 }\
900 }\
901 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
902 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
903 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
904 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
905 }\
906 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
907 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
908 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
909 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
910 }\
911 \
912 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
913 {\
914 int i, a0, b0, a1, b1;\
915 a0= pixels[0];\
916 b0= pixels[1] + 2;\
917 a0 += b0;\
918 b0 += pixels[2];\
919 \
920 pixels+=line_size;\
921 for(i=0; i<h; i+=2){\
922 a1= pixels[0];\
923 b1= pixels[1];\
924 a1 += b1;\
925 b1 += pixels[2];\
926 \
927 block[0]= (a1+a0)>>2; /* FIXME non put */\
928 block[1]= (b1+b0)>>2;\
929 \
930 pixels+=line_size;\
931 block +=line_size;\
932 \
933 a0= pixels[0];\
934 b0= pixels[1] + 2;\
935 a0 += b0;\
936 b0 += pixels[2];\
937 \
938 block[0]= (a1+a0)>>2;\
939 block[1]= (b1+b0)>>2;\
940 pixels+=line_size;\
941 block +=line_size;\
942 }\
943 }\
944 \
945 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
946 {\
947 int i;\
948 const uint32_t a= LD32(pixels );\
949 const uint32_t b= LD32(pixels+1);\
950 uint32_t l0= (a&0x03030303UL)\
951 + (b&0x03030303UL)\
952 + 0x02020202UL;\
953 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
954 + ((b&0xFCFCFCFCUL)>>2);\
955 uint32_t l1,h1;\
956 \
957 pixels+=line_size;\
958 for(i=0; i<h; i+=2){\
959 uint32_t a= LD32(pixels );\
960 uint32_t b= LD32(pixels+1);\
961 l1= (a&0x03030303UL)\
962 + (b&0x03030303UL);\
963 h1= ((a&0xFCFCFCFCUL)>>2)\
964 + ((b&0xFCFCFCFCUL)>>2);\
965 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
966 pixels+=line_size;\
967 block +=line_size;\
968 a= LD32(pixels );\
969 b= LD32(pixels+1);\
970 l0= (a&0x03030303UL)\
971 + (b&0x03030303UL)\
972 + 0x02020202UL;\
973 h0= ((a&0xFCFCFCFCUL)>>2)\
974 + ((b&0xFCFCFCFCUL)>>2);\
975 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
976 pixels+=line_size;\
977 block +=line_size;\
978 }\
979 }\
980 \
981 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
982 {\
983 int j;\
984 for(j=0; j<2; j++){\
985 int i;\
986 const uint32_t a= LD32(pixels );\
987 const uint32_t b= LD32(pixels+1);\
988 uint32_t l0= (a&0x03030303UL)\
989 + (b&0x03030303UL)\
990 + 0x02020202UL;\
991 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
992 + ((b&0xFCFCFCFCUL)>>2);\
993 uint32_t l1,h1;\
994 \
995 pixels+=line_size;\
996 for(i=0; i<h; i+=2){\
997 uint32_t a= LD32(pixels );\
998 uint32_t b= LD32(pixels+1);\
999 l1= (a&0x03030303UL)\
1000 + (b&0x03030303UL);\
1001 h1= ((a&0xFCFCFCFCUL)>>2)\
1002 + ((b&0xFCFCFCFCUL)>>2);\
1003 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1004 pixels+=line_size;\
1005 block +=line_size;\
1006 a= LD32(pixels );\
1007 b= LD32(pixels+1);\
1008 l0= (a&0x03030303UL)\
1009 + (b&0x03030303UL)\
1010 + 0x02020202UL;\
1011 h0= ((a&0xFCFCFCFCUL)>>2)\
1012 + ((b&0xFCFCFCFCUL)>>2);\
1013 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1014 pixels+=line_size;\
1015 block +=line_size;\
1016 }\
1017 pixels+=4-line_size*(h+1);\
1018 block +=4-line_size*h;\
1019 }\
1020 }\
1021 \
1022 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1023 {\
1024 int j;\
1025 for(j=0; j<2; j++){\
1026 int i;\
1027 const uint32_t a= LD32(pixels );\
1028 const uint32_t b= LD32(pixels+1);\
1029 uint32_t l0= (a&0x03030303UL)\
1030 + (b&0x03030303UL)\
1031 + 0x01010101UL;\
1032 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1033 + ((b&0xFCFCFCFCUL)>>2);\
1034 uint32_t l1,h1;\
1035 \
1036 pixels+=line_size;\
1037 for(i=0; i<h; i+=2){\
1038 uint32_t a= LD32(pixels );\
1039 uint32_t b= LD32(pixels+1);\
1040 l1= (a&0x03030303UL)\
1041 + (b&0x03030303UL);\
1042 h1= ((a&0xFCFCFCFCUL)>>2)\
1043 + ((b&0xFCFCFCFCUL)>>2);\
1044 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1045 pixels+=line_size;\
1046 block +=line_size;\
1047 a= LD32(pixels );\
1048 b= LD32(pixels+1);\
1049 l0= (a&0x03030303UL)\
1050 + (b&0x03030303UL)\
1051 + 0x01010101UL;\
1052 h0= ((a&0xFCFCFCFCUL)>>2)\
1053 + ((b&0xFCFCFCFCUL)>>2);\
1054 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1055 pixels+=line_size;\
1056 block +=line_size;\
1057 }\
1058 pixels+=4-line_size*(h+1);\
1059 block +=4-line_size*h;\
1060 }\
1061 }\
1062 \
1063 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1064 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1065 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1066 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1067 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1068 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1069 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1070 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1071
/* Pixel-op plug-ins for PIXOP2: op_avg rounds-and-averages with the existing
 * destination word, op_put overwrites it. */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif /* NOTE(review): matching #if is above this chunk -- presumably selects
          the 64-bit vs 32-bit PIXOP2 implementation; confirm against full file */
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
1080
/* Rounded 2-tap and 4-tap byte averages used by the interpolation code below. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1083
/* Adapter exposing the macro-generated put_no_rnd_pixels16_l2() (from PIXOP2
 * above) with a single common stride for dst and both sources. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1087
/* Adapter exposing the macro-generated put_no_rnd_pixels8_l2() (from PIXOP2
 * above) with a single common stride for dst and both sources. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1091
/**
 * Single-vector GMC: bilinear interpolation of an 8-pixel-wide block with
 * 1/16-pel fractional offsets (x16, y16 in 0..16).  The four weights A..D sum
 * to 256, so the result is normalized by >>8 after adding 'rounder'.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int row;

    for (row = 0; row < h; row++) {
        int col;
        /* weight the 2x2 neighbourhood of each output pixel */
        for (col = 0; col < 8; col++)
            dst[col] = (A*src[col]        + B*src[col+1]
                      + C*src[stride+col] + D*src[stride+col+1] + rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
1114
/**
 * Affine global motion compensation (MPEG-4 GMC) of an 8-pixel-wide block.
 * (ox,oy) is the fixed-point source position of the block origin; per output
 * column the position advances by (dxx,dyx), per output row by (dxy,dyy).
 * vx/vy carry 16+shift fractional bits: >>16 drops the low fraction, the
 * remaining 'shift' bits become the sub-pel fraction for bilinear weighting.
 * 'r' is the rounding constant added before the final >>(2*shift).
 * width/height give the valid source dimensions; positions outside fall back
 * to edge-clamped interpolation (clip() is a project helper defined elsewhere).
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;  /* now the last valid column index, so the interior tests below
                 also guarantee index+1 / index+stride are in range */
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* (unsigned) compare rejects negative coordinates in one test */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside the source: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( ( src[index         ]*(s-frac_x)
                                         + src[index       +1]*   frac_x )*(s-frac_y)
                                       + ( src[index+stride  ]*(s-frac_x)
                                         + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp y, interpolate horizontally only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( ( src[index         ]*(s-frac_x)
                                         + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp x, interpolate vertically only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( ( src[index         ]*(s-frac_y)
                                         + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* outside in both directions: nearest edge sample */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1172
/* Full-pel case: dispatch to the plain width-specific copy routine.
 * Widths other than 2/4/8/16 are silently ignored, as before. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1181
/* Third-pel interpolation, horizontal phase 1/3:
 * out = 683*(2*a + b + 1) >> 11, where 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (683*(2*s[x] + s[x+1] + 1)) >> 11;
    }
}
1192
/* Third-pel interpolation, horizontal phase 2/3:
 * out = 683*(a + 2*b + 1) >> 11, where 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (683*(s[x] + 2*s[x+1] + 1)) >> 11;
    }
}
1203
/* Third-pel interpolation, vertical phase 1/3:
 * out = 683*(2*top + bottom + 1) >> 11, where 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (683*(2*s[x] + s[x+stride] + 1)) >> 11;
    }
}
1214
/* Third-pel interpolation, diagonal phase (1/3, 1/3): weighted 2x2 bilinear
 * with weights 4/3/3/2 (sum 12), normalized via 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (2731*(4*s[x] + 3*s[x+1] + 3*s[x+stride] + 2*s[x+stride+1] + 6)) >> 15;
    }
}
1225
/* Third-pel interpolation, phase (1/3, 2/3): weighted 2x2 bilinear with
 * weights 3/2/4/3 (sum 12), normalized via 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (2731*(3*s[x] + 2*s[x+1] + 4*s[x+stride] + 3*s[x+stride+1] + 6)) >> 15;
    }
}
1236
/* Third-pel interpolation, vertical phase 2/3:
 * out = 683*(top + 2*bottom + 1) >> 11, where 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (683*(s[x] + 2*s[x+stride] + 1)) >> 11;
    }
}
1247
/* Third-pel interpolation, phase (2/3, 1/3): weighted 2x2 bilinear with
 * weights 3/4/2/3 (sum 12), normalized via 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (2731*(3*s[x] + 4*s[x+1] + 2*s[x+stride] + 3*s[x+stride+1] + 6)) >> 15;
    }
}
1258
/* Third-pel interpolation, diagonal phase (2/3, 2/3): weighted 2x2 bilinear
 * with weights 2/3/3/4 (sum 12), normalized via 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (2731*(2*s[x] + 3*s[x+1] + 3*s[x+stride] + 4*s[x+stride+1] + 6)) >> 15;
    }
}
1269
/* Full-pel averaging case: dispatch to the plain width-specific averaging
 * routine.  Widths other than 2/4/8/16 are silently ignored, as before. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1278
/* Third-pel filter (horizontal phase 1/3, 683/2048 ~= 1/3) followed by a
 * rounded average with the pixels already in dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (d[x] + ((683*(2*s[x] + s[x+1] + 1)) >> 11) + 1) >> 1;
    }
}
1289
/* Third-pel filter (horizontal phase 2/3, 683/2048 ~= 1/3) followed by a
 * rounded average with the pixels already in dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (d[x] + ((683*(s[x] + 2*s[x+1] + 1)) >> 11) + 1) >> 1;
    }
}
1300
/* Third-pel filter (vertical phase 1/3, 683/2048 ~= 1/3) followed by a
 * rounded average with the pixels already in dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (d[x] + ((683*(2*s[x] + s[x+stride] + 1)) >> 11) + 1) >> 1;
    }
}
1311
/* Third-pel 2x2 filter, phase (1/3,1/3), weights 4/3/3/2 (sum 12, via
 * 2731/32768 ~= 1/12), then a rounded average with the existing dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (d[x] + ((2731*(4*s[x] + 3*s[x+1] + 3*s[x+stride] + 2*s[x+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}
1322
/* Third-pel 2x2 filter, phase (1/3,2/3), weights 3/2/4/3 (sum 12, via
 * 2731/32768 ~= 1/12), then a rounded average with the existing dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (d[x] + ((2731*(3*s[x] + 2*s[x+1] + 4*s[x+stride] + 3*s[x+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}
1333
/* Third-pel filter (vertical phase 2/3, 683/2048 ~= 1/3) followed by a
 * rounded average with the pixels already in dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (d[x] + ((683*(s[x] + 2*s[x+stride] + 1)) >> 11) + 1) >> 1;
    }
}
1344
/* Third-pel 2x2 filter, phase (2/3,1/3), weights 3/4/2/3 (sum 12, via
 * 2731/32768 ~= 1/12), then a rounded average with the existing dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (d[x] + ((2731*(3*s[x] + 4*s[x+1] + 2*s[x+stride] + 3*s[x+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}
1355
/* Third-pel 2x2 filter, phase (2/3,2/3), weights 2/3/3/4 (sum 12, via
 * 2731/32768 ~= 1/12), then a rounded average with the existing dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y*stride;
        uint8_t       *d = dst + y*stride;
        for (x = 0; x < width; x++)
            d[x] = (d[x] + ((2731*(2*s[x] + 3*s[x+1] + 3*s[x+stride] + 4*s[x+stride+1] + 6)) >> 15) + 1) >> 1;
    }
}
#if 0
/* NOTE(review): dead code.  The stray 'void' before each intended call turns
 * the body into a malformed declaration, so this generator could not compile
 * (or would do nothing) if ever enabled.  Kept disabled for reference. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1387
/**
 * Generator for the C H.264 chroma motion-compensation routines.
 * (x,y) is the eighth-pel fraction of the motion vector (asserted 0..7); the
 * four bilinear weights A..D sum to 64, so OP() receives a 6-bit fixed-point
 * value that the plugged-in op macro normalizes.  Emits mc2/mc4/mc8 variants
 * for 2-, 4- and 8-pixel-wide blocks, h rows each.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1450
/* The chroma weights A..D sum to 64, hence the (+32)>>6 rounding; op_avg
 * additionally takes a rounded average with the value already in dst. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_ , op_put)
H264_CHROMA_MC(avg_ , op_avg)
#undef op_avg
#undef op_put
1458
/* Copy a 4-byte-wide block of h rows.  LD32/ST32 are project macros for
 * 32-bit loads/stores (presumably unaligned-safe -- see dsputil.h). */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1469
/* Copy an 8-byte-wide block of h rows via two 32-bit loads/stores per row. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1481
/* Copy a 16-byte-wide block of h rows via four 32-bit loads/stores per row. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst+=dstStride;
        src+=srcStride;
    }
}
1495
/* Copy a 17-byte-wide block of h rows: 16 bytes via 32-bit loads/stores plus
 * one tail byte (the extra column needed by filters reading src[16]). */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16];
        dst+=dstStride;
        src+=srcStride;
    }
}
1510
/* Copy a 9-byte-wide block of h rows: 8 bytes via 32-bit loads/stores plus
 * one tail byte (the extra column needed by filters reading src[8]). */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst[8]= src[8];
        dst+=dstStride;
        src+=srcStride;
    }
}
1523
1524
1525 #define QPEL_MC(r, OPNAME, RND, OP) \
1526 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1527 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1528 int i;\
1529 for(i=0; i<h; i++)\
1530 {\
1531 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1532 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1533 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1534 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1535 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1536 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1537 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1538 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1539 dst+=dstStride;\
1540 src+=srcStride;\
1541 }\
1542 }\
1543 \
1544 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1545 const int w=8;\
1546 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1547 int i;\
1548 for(i=0; i<w; i++)\
1549 {\
1550 const int src0= src[0*srcStride];\
1551 const int src1= src[1*srcStride];\
1552 const int src2= src[2*srcStride];\
1553 const int src3= src[3*srcStride];\
1554 const int src4= src[4*srcStride];\
1555 const int src5= src[5*srcStride];\
1556 const int src6= src[6*srcStride];\
1557 const int src7= src[7*srcStride];\
1558 const int src8= src[8*srcStride];\
1559 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1560 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1561 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1562 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1563 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1564 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1565 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1566 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1567 dst++;\
1568 src++;\
1569 }\
1570 }\
1571 \
1572 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1573 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1574 int i;\
1575 \
1576 for(i=0; i<h; i++)\
1577 {\
1578 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1579 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1580 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1581 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1582 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1583 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1584 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1585 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1586 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1587 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1588 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1589 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1590 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1591 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1592 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1593 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1594 dst+=dstStride;\
1595 src+=srcStride;\
1596 }\
1597 }\
1598 \
1599 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1600 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1601 int i;\
1602 const int w=16;\
1603 for(i=0; i<w; i++)\
1604 {\
1605 const int src0= src[0*srcStride];\
1606 const int src1= src[1*srcStride];\
1607 const int src2= src[2*srcStride];\
1608 const int src3= src[3*srcStride];\
1609 const int src4= src[4*srcStride];\
1610 const int src5= src[5*srcStride];\
1611 const int src6= src[6*srcStride];\
1612 const int src7= src[7*srcStride];\
1613 const int src8= src[8*srcStride];\
1614 const int src9= src[9*srcStride];\
1615 const int src10= src[10*srcStride];\
1616 const int src11= src[11*srcStride];\
1617 const int src12= src[12*srcStride];\
1618 const int src13= src[13*srcStride];\
1619 const int src14= src[14*srcStride];\
1620 const int src15= src[15*srcStride];\
1621 const int src16= src[16*srcStride];\
1622 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1623 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1624 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1625 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1626 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1627 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1628 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1629 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1630 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1631 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1632 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1633 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1634 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1635 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1636 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1637 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1638 dst++;\
1639 src++;\
1640 }\
1641 }\
1642 \
1643 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1644 OPNAME ## pixels8_c(dst, src, stride, 8);\
1645 }\
1646 \
1647 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1648 uint8_t half[64];\
1649 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1650 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1651 }\
1652 \
1653 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1654 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1655 }\
1656 \
1657 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1658 uint8_t half[64];\
1659 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1660 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1661 }\
1662 \
1663 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1664 uint8_t full[16*9];\
1665 uint8_t half[64];\
1666 copy_block9(full, src, 16, stride, 9);\
1667 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1668 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1669 }\
1670 \
1671 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1672 uint8_t full[16*9];\
1673 copy_block9(full, src, 16, stride, 9);\
1674 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1675 }\
1676 \
1677 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1678 uint8_t full[16*9];\
1679 uint8_t half[64];\
1680 copy_block9(full, src, 16, stride, 9);\
1681 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1682 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1683 }\
1684 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1685 uint8_t full[16*9];\
1686 uint8_t halfH[72];\
1687 uint8_t halfV[64];\
1688 uint8_t halfHV[64];\
1689 copy_block9(full, src, 16, stride, 9);\
1690 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1691 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1692 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1693 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1694 }\
1695 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1696 uint8_t full[16*9];\
1697 uint8_t halfH[72];\
1698 uint8_t halfHV[64];\
1699 copy_block9(full, src, 16, stride, 9);\
1700 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1701 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1702 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1703 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1704 }\
1705 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1706 uint8_t full[16*9];\
1707 uint8_t halfH[72];\
1708 uint8_t halfV[64];\
1709 uint8_t halfHV[64];\
1710 copy_block9(full, src, 16, stride, 9);\
1711 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1712 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1713 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1714 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1715 }\
1716 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1717 uint8_t full[16*9];\
1718 uint8_t halfH[72];\
1719 uint8_t halfHV[64];\
1720 copy_block9(full, src, 16, stride, 9);\
1721 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1722 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1723 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1724 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1725 }\
1726 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1727 uint8_t full[16*9];\
1728 uint8_t halfH[72];\
1729 uint8_t halfV[64];\
1730 uint8_t halfHV[64];\
1731 copy_block9(full, src, 16, stride, 9);\
1732 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1733 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1734 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1735 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1736 }\
1737 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1738 uint8_t full[16*9];\
1739 uint8_t halfH[72];\
1740 uint8_t halfHV[64];\
1741 copy_block9(full, src, 16, stride, 9);\
1742 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1743 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1744 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1745 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1746 }\
1747 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1748 uint8_t full[16*9];\
1749 uint8_t halfH[72];\
1750 uint8_t halfV[64];\
1751 uint8_t halfHV[64];\
1752 copy_block9(full, src, 16, stride, 9);\
1753 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1754 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1755 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1756 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1757 }\
1758 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1759 uint8_t full[16*9];\
1760 uint8_t halfH[72];\
1761 uint8_t halfHV[64];\
1762 copy_block9(full, src, 16, stride, 9);\
1763 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1764 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1765 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1766 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1767 }\
1768 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1769 uint8_t halfH[72];\
1770 uint8_t halfHV[64];\
1771 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1772 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1773 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1774 }\
1775 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1776 uint8_t halfH[72];\
1777 uint8_t halfHV[64];\
1778 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1779 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1780 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1781 }\
1782 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1783 uint8_t full[16*9];\
1784 uint8_t halfH[72];\
1785 uint8_t halfV[64];\
1786 uint8_t halfHV[64];\
1787 copy_block9(full, src, 16, stride, 9);\
1788 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1789 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1790 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1791 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1792 }\
1793 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1794 uint8_t full[16*9];\
1795 uint8_t halfH[72];\
1796 copy_block9(full, src, 16, stride, 9);\
1797 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1798 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1799 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1800 }\
1801 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1802 uint8_t full[16*9];\
1803 uint8_t halfH[72];\
1804 uint8_t halfV[64];\
1805 uint8_t halfHV[64];\
1806 copy_block9(full, src, 16, stride, 9);\
1807 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1808 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1809 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1810 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1811 }\
1812 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1813 uint8_t full[16*9];\
1814 uint8_t halfH[72];\
1815 copy_block9(full, src, 16, stride, 9);\
1816 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1817 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1818 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1819 }\
1820 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1821 uint8_t halfH[72];\
1822 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1823 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1824 }\
1825 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1826 OPNAME ## pixels16_c(dst, src, stride, 16);\
1827 }\
1828 \
1829 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1830 uint8_t half[256];\
1831 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1832 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1833 }\
1834 \
1835 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1836 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1837 }\
1838 \
1839 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1840 uint8_t half[256];\
1841 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1842 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1843 }\
1844 \
1845 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1846 uint8_t full[24*17];\
1847 uint8_t half[256];\
1848 copy_block17(full, src, 24, stride, 17);\
1849 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1850 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1851 }\
1852 \
1853 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1854 uint8_t full[24*17];\
1855 copy_block17(full, src, 24, stride, 17);\
1856 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1857 }\
1858 \
1859 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1860 uint8_t full[24*17];\
1861 uint8_t half[256];\
1862 copy_block17(full, src, 24, stride, 17);\
1863 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1864 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1865 }\
1866 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1867 uint8_t full[24*17];\
1868 uint8_t halfH[272];\
1869 uint8_t halfV[256];\
1870 uint8_t halfHV[256];\
1871 copy_block17(full, src, 24, stride, 17);\
1872 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1873 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1874 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1875 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1876 }\
1877 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1878 uint8_t full[24*17];\
1879 uint8_t halfH[272];\
1880 uint8_t halfHV[256];\
1881 copy_block17(full, src, 24, stride, 17);\
1882 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1883 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1884 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1885 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1886 }\
1887 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1888 uint8_t full[24*17];\
1889 uint8_t halfH[272];\
1890 uint8_t halfV[256];\
1891 uint8_t halfHV[256];\
1892 copy_block17(full, src, 24, stride, 17);\
1893 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1894 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1895 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1896 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1897 }\
1898 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1899 uint8_t full[24*17];\
1900 uint8_t halfH[272];\
1901 uint8_t halfHV[256];\
1902 copy_block17(full, src, 24, stride, 17);\
1903 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1904 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1905 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1906 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1907 }\
1908 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1909 uint8_t full[24*17];\
1910 uint8_t halfH[272];\
1911 uint8_t halfV[256];\
1912 uint8_t halfHV[256];\
1913 copy_block17(full, src, 24, stride, 17);\
1914 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1915 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1916 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1917 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1918 }\
1919 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1920 uint8_t full[24*17];\
1921 uint8_t halfH[272];\
1922 uint8_t halfHV[256];\
1923 copy_block17(full, src, 24, stride, 17);\
1924 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1925 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1926 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1927 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1928 }\
1929 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1930 uint8_t full[24*17];\
1931 uint8_t halfH[272];\
1932 uint8_t halfV[256];\
1933 uint8_t halfHV[256];\
1934 copy_block17(full, src, 24, stride, 17);\
1935 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1936 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1937 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1938 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1939 }\
1940 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1941 uint8_t full[24*17];\
1942 uint8_t halfH[272];\
1943 uint8_t halfHV[256];\
1944 copy_block17(full, src, 24, stride, 17);\
1945 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1946 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1947 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1948 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1949 }\
1950 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1951 uint8_t halfH[272];\
1952 uint8_t halfHV[256];\
1953 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1954 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1955 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1956 }\
1957 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1958 uint8_t halfH[272];\
1959 uint8_t halfHV[256];\
1960 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1961 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1962 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1963 }\
1964 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1965 uint8_t full[24*17];\
1966 uint8_t halfH[272];\
1967 uint8_t halfV[256];\
1968 uint8_t halfHV[256];\
1969 copy_block17(full, src, 24, stride, 17);\
1970 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1971 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1972 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1973 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1974 }\
1975 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1976 uint8_t full[24*17];\
1977 uint8_t halfH[272];\
1978 copy_block17(full, src, 24, stride, 17);\
1979 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1980 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1981 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1982 }\
1983 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1984 uint8_t full[24*17];\
1985 uint8_t halfH[272];\
1986 uint8_t halfV[256];\
1987 uint8_t halfHV[256];\
1988 copy_block17(full, src, 24, stride, 17);\
1989 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1990 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1991 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1992 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1993 }\
1994 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1995 uint8_t full[24*17];\
1996 uint8_t halfH[272];\
1997 copy_block17(full, src, 24, stride, 17);\
1998 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1999 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2000 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2001 }\
2002 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2003 uint8_t halfH[272];\
2004 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2005 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2006 }
2007
/* Store primitives plugged into QPEL_MC above.  "b" is a raw 6-tap filter
   sum scaled by 32, so (b+16)>>5 rounds-to-nearest and rescales before
   clipping through the cm[] crop table.  The no_rnd variants add 15
   instead of 16 (round down), as required by MPEG-4 no-rounding mode. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the qpel motion-compensation function families. */
QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
2021
#if 1
/*
 * H.264 half-pel interpolation core using the 6-tap (1,-5,20,20,-5,1)
 * filter, instantiated below with "put" and "avg" store semantics.
 *
 * OPNAME: prefix for the generated function names (put_/avg_).
 * OP:     store macro for single-pass results; the filter sum is scaled
 *         by 32, so OP adds 16 and shifts right by 5 before clipping.
 * OP2:    store macro for two-pass (hv) results; those sums are scaled
 *         by 32*32, so OP2 adds 512 and shifts right by 10.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
/* 2D half-pel: horizontal 6-tap pass into the 16-bit tmp buffer at full \
   precision, then vertical 6-tap pass over tmp; OP2 performs the combined \
   rescale/round.  tmp must hold tmpStride*(h+5) int16_t entries. */\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
/* 16x16 variants are composed of four 8x8 calls (top-left, top-right, \
   bottom-left, bottom-right quadrants). */\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/*
 * H.264 quarter-pel motion compensation, one function per fractional
 * position _mcXY where X is the horizontal and Y the vertical quarter-pel
 * offset (0..3).  Quarter-pel samples are formed by averaging the
 * appropriate full-pel/half-pel planes (pixels_l2); half-pel planes come
 * from the H264_LOWPASS filters above.  copy_blockN pads the source into
 * "full" so the vertical filter's 2-above/3-below taps stay in bounds;
 * full_mid points at the first real row inside that padded block.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

/* Store macros for the H.264 filters: single-pass sums are scaled by 32
   (round with +16, >>5), two-pass hv sums by 1024 (round with +512, >>10);
   both clip through the cm[] crop table.  avg variants round-average with
   the existing destination pixel. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Generate the lowpass filter cores and the per-position MC wrappers for
   block sizes 4, 8 and 16, in both put and avg flavours. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2382
/* H.264 weighted prediction (explicit/implicit).
   op_scale1: uni-directional  block[x] = clip((block[x]*weight + offset) >> log2_denom)
   op_scale2: bi-directional   dst[x]   = clip((src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/* Generates weight/biweight functions for a WxH block.  The unrolled body
   covers the widest case (16); "if(W==n) continue;" cuts it down for the
   narrower instantiations at compile time. */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int attribute_unused x, y; \
    /* pre-scale the offset and fold in the rounding term (half of the \
       final divisor) so the loop body is a single multiply-add-shift */ \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
    int attribute_unused x, y; \
    /* average the two per-reference offsets, then pre-shift and add the \
       rounding term: (2*o+1)<<log2_denom == o<<(log2_denom+1) + 1<<log2_denom, \
       matching op_scale2's >>(log2_denom+1) */ \
    int offset = (offsets + offsetd + 1) >> 1; \
    offset = ((offset << 1) + 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

/* All block shapes used by H.264 weighted prediction. */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2453
2454 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2455 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2456 int i;
2457
2458 for(i=0; i<h; i++){
2459 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2460 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2461 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2462 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2463 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2464 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2465 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2466 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2467 dst+=dstStride;
2468 src+=srcStride;
2469 }
2470 }
2471
2472 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2473 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2474 int i;
2475
2476 for(i=0; i<w; i++){
2477 const int src_1= src[ -srcStride];
2478 const int src0 = src[0 ];
2479 const int src1 = src[ srcStride];
2480 const int src2 = src[2*srcStride];
2481 const int src3 = src[3*srcStride];
2482 const int src4 = src[4*srcStride];
2483 const int src5 = src[5*srcStride];
2484 const int src6 = src[6*srcStride];
2485 const int src7 = src[7*srcStride];
2486 const int src8 = src[8*srcStride];
2487 const int src9 = src[9*srcStride];
2488 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2489 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2490 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2491 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2492 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2493 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2494 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2495 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2496 src++;
2497 dst++;
2498 }
2499 }
2500
/** WMV2 mspel MC, integer-pel position (0,0): plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2504
2505 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2506 uint8_t half[64];
2507 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2508 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2509 }
2510
/** WMV2 mspel MC, position (2,0): horizontal half-pel, filtered directly
 *  into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2514
2515 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2516 uint8_t half[64];
2517 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2518 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2519 }
2520
/** WMV2 mspel MC, position (0,2): vertical half-pel, filtered directly
 *  into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2524
/** WMV2 mspel MC, h quarter-pel (left) + v half-pel: blend the vertically
 *  lowpassed plane with the h-then-v lowpassed plane. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t htmp[88];  /* 8x11 h-filtered strip: 1 row above + 2 below for the v pass */
    uint8_t vtmp[64];
    uint8_t hvtmp[64];

    wmv2_mspel8_h_lowpass(htmp, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vtmp, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvtmp, htmp+8, 8, 8, 8);
    put_pixels8_l2(dst, vtmp, hvtmp, stride, 8, 8, 8);
}
/** WMV2 mspel MC, h quarter-pel (right) + v half-pel: like mc12 but the
 *  vertical-only plane starts at src+1. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t htmp[88];  /* 8x11 h-filtered strip: 1 row above + 2 below for the v pass */
    uint8_t vtmp[64];
    uint8_t hvtmp[64];

    wmv2_mspel8_h_lowpass(htmp, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vtmp, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvtmp, htmp+8, 8, 8, 8);
    put_pixels8_l2(dst, vtmp, hvtmp, stride, 8, 8, 8);
}
/** WMV2 mspel MC, h half-pel + v half-pel: horizontal pass into a temp
 *  strip, vertical pass straight into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t htmp[88];  /* 8x11 h-filtered strip feeding the vertical pass */

    wmv2_mspel8_h_lowpass(htmp, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, htmp+8, stride, 8, 8);
}
2548
/**
 * H.263 deblocking across a horizontal block edge: for each of 8 columns,
 * filters the 4 samples p0..p3 straddling the edge. Filter strength is
 * looked up from the quantizer.
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;  /* gradient estimate across the edge */

        /* piecewise-linear limiting of d: ramps up to 'strength', then back
           down to zero at 2*strength, so strong real edges are untouched */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* clamp to 0..255: if bit 8 is set the value is out of range;
           ~(v>>31) gives 255 for overflow and 0 for underflow */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= ABS(d1)>>1;

        /* weaker correction of the outer pair, limited to half of |d1| */
        d2= clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+ stride]  = p3 + d2;
    }
}
2583
/**
 * H.263 deblocking across a vertical block edge: for each of 8 rows,
 * filters the 4 samples p0..p3 straddling the edge. Same math as
 * h263_v_loop_filter_c with the access pattern transposed.
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;  /* gradient estimate across the edge */

        /* piecewise-linear limiting of d (see h263_v_loop_filter_c) */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* clamp to 0..255 via the bit-8 overflow trick */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= ABS(d1)>>1;

        /* weaker correction of the outer pair, limited to half of |d1| */
        d2= clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
}
2618
/**
 * In-place H.261 loop filter: separable [1 2 1]/4 smoothing of an 8x8
 * block. Border rows/columns are only scaled, never filtered across the
 * block boundary; both passes carry 2 extra bits of precision and round
 * on the final write-back.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vfilt[64]; /* vertically filtered plane at 4x scale */
    int i, j;

    /* vertical pass into the temp plane */
    for(j=0; j<8; j++){
        for(i=0; i<8; i++){
            if(j==0 || j==7)
                vfilt[j*8 + i] = 4*src[j*stride + i];
            else
                vfilt[j*8 + i] = src[(j-1)*stride + i]
                               + 2*src[ j   *stride + i]
                               + src[(j+1)*stride + i];
        }
    }

    /* horizontal pass, writing back with rounding */
    for(j=0; j<8; j++){
        src[j*stride    ] = (vfilt[j*8    ] + 2)>>2;
        src[j*stride + 7] = (vfilt[j*8 + 7] + 2)>>2;
        for(i=1; i<7; i++)
            src[j*stride + i] = (vfilt[j*8+i-1] + 2*vfilt[j*8+i] + vfilt[j*8+i+1] + 8)>>4;
    }
}
2645
/**
 * H.264 normal (bS<4) luma deblocking along one 16-sample edge,
 * processed as 4 groups of 4 lines. tc0[i] < 0 disables the group.
 * xstride steps across the edge, ystride along it.
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;   /* filtering disabled for this group */
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            /* only filter where the edge looks like a blocking artifact */
            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* p1/q1 adjusted only on smooth sides; each adjustment
                   widens the clip range (tc) used for p0/q0 below */
                if( ABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( ABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/** H.264 luma deblocking of a horizontal edge (vertical filtering). */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/** H.264 luma deblocking of a vertical edge (horizontal filtering). */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2694
/**
 * H.264 normal (bS<4) chroma deblocking along one 8-sample edge,
 * processed as 4 groups of 2 lines; only p0/q0 are modified.
 * tc0[i] <= 0 disables the group.
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;   /* filtering disabled for this group */
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            /* only filter where the edge looks like a blocking artifact */
            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/** H.264 chroma deblocking of a horizontal edge (vertical filtering). */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/** H.264 chroma deblocking of a vertical edge (horizontal filtering). */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
2731
/**
 * H.264 strong (intra, bS=4) chroma deblocking along one 8-sample edge.
 * Where the edge passes the alpha/beta gradient tests, p0 and q0 are
 * replaced by a [2 1 1]/4 blend of their neighbours.
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int k;
    for( k = 0; k < 8; k++, pix += ystride ) {
        const int q0 = pix[0];
        const int q1 = pix[ xstride];
        const int p0 = pix[-xstride];
        const int p1 = pix[-2*xstride];

        /* skip lines that look like real edges rather than block artifacts */
        if( ABS( p0 - q0 ) >= alpha )
            continue;
        if( ABS( p1 - p0 ) >= beta || ABS( q1 - q0 ) >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/** H.264 intra chroma deblocking of a horizontal edge (vertical filtering). */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/** H.264 intra chroma deblocking of a vertical edge (horizontal filtering). */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
2759
/** SAD of a 16-wide block, h rows: sum of |pix1 - pix2|. */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2787
/** SAD of a 16-wide block against the horizontal half-pel interpolation
 *  (average of each reference pixel with its right neighbour). */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2815
/** SAD of a 16-wide block against the vertical half-pel interpolation
 *  (average of each reference pixel with the one below it). */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int x, y;
    uint8_t *below = pix2 + line_size;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2845
/** SAD of a 16-wide block against the diagonal half-pel interpolation
 *  (4-tap average of the 2x2 reference neighbourhood). */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int x, y;
    uint8_t *below = pix2 + line_size;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2875
/** SAD of an 8-wide block, h rows: sum of |pix1 - pix2|. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2895
/** 8-wide SAD against the horizontal half-pel interpolation. */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2915
/** 8-wide SAD against the vertical half-pel interpolation. */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int x, y;
    uint8_t *below = pix2 + line_size;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2937
/** 8-wide SAD against the diagonal half-pel interpolation. */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int x, y;
    uint8_t *below = pix2 + line_size;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2959
/**
 * Noise-preserving SSE, 16-wide: SSE plus a weighted penalty for the
 * mismatch in 2x2 gradient structure between the two blocks, so that
 * texture/noise matching the source is penalized less than blur.
 * If c is NULL a fixed weight of 8 is used.
 */
static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score1=0;   /* plain sum of squared errors */
    int score2=0;   /* signed difference of 2x2 gradient magnitudes */
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x  ])*(s1[x  ] - s2[x  ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= ABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -ABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
    else  return score1 + ABS(score2)*8;
}
2984
/**
 * Noise-preserving SSE, 8-wide variant of nsse16_c.
 */
static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score1=0;   /* plain sum of squared errors */
    int score2=0;   /* signed difference of 2x2 gradient magnitudes */
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x  ])*(s1[x  ] - s2[x  ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= ABS(  s1[x  ] - s1[x  +stride]
                             - s1[x+1] + s1[x+1+stride])
                        -ABS(  s2[x  ] - s2[x  +stride]
                             - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
    else  return score1 + ABS(score2)*8;
}
3009
/**
 * Estimate the weighted squared error that results from adding
 * basis[]*scale (BASIS_SHIFT fixed point) to the residual rem[]
 * (RECON_SHIFT fixed point). Used by the trellis/CBP search.
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        /* rounded conversion of the scaled basis from BASIS_SHIFT down to
           RECON_SHIFT precision before adding it to the residual */
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
3024
/**
 * Add basis[]*scale (BASIS_SHIFT fixed point) to the residual rem[]
 * (RECON_SHIFT fixed point), with rounding — the commit counterpart
 * of try_8x8basis_c.
 */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
3032
3033 /**
3034 * permutes an 8x8 block.
3035 * @param block the block which will be permuted according to the given permutation vector
3036 * @param permutation the permutation vector
3037 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3038 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3039 * (inverse) permutated to scantable order!
3040 */
3041 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3042 {
3043 int i;
3044 DCTELEM temp[64];
3045
3046 if(last<=0) return;
3047 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3048
3049 for(i=0; i<=last; i++){
3050 const int j= scantable[i];
3051 temp[j]= block[j];
3052 block[j]=0;
3053 }
3054
3055 for(i=0; i<=last; i++){
3056 const int j= scantable[i];
3057 const int perm_j= permutation[j];
3058 block[perm_j]= temp[j];
3059 }
3060 }
3061
/** FF_CMP_ZERO: always scores 0, effectively disabling the comparison. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
3065
3066 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3067 int i;
3068
3069 memset(cmp, 0, sizeof(void*)*5);
3070
3071 for(i=0; i<5; i++){
3072 switch(type&0xFF){
3073 case FF_CMP_SAD:
3074 cmp[i]= c->sad[i];
3075 break;
3076 case FF_CMP_SATD:
3077 cmp[i]= c->hadamard8_diff[i];
3078 break;
3079 case FF_CMP_SSE:
3080 cmp[i]= c->sse[i];
3081 break;
3082 case FF_CMP_DCT:
3083 cmp[i]= c->dct_sad[i];
3084 break;
3085 case FF_CMP_DCTMAX:
3086 cmp[i]= c->dct_max[i];
3087 break;
3088 case FF_CMP_PSNR:
3089 cmp[i]= c->quant_psnr[i];
3090 break;
3091 case FF_CMP_BIT:
3092 cmp[i]= c->bit[i];
3093 break;
3094 case FF_CMP_RD:
3095 cmp[i]= c->rd[i];
3096 break;
3097 case FF_CMP_VSAD:
3098 cmp[i]= c->vsad[i];
3099 break;
3100 case FF_CMP_VSSE:
3101 cmp[i]= c->vsse[i];
3102 break;
3103 case FF_CMP_ZERO:
3104 cmp[i]= zero_cmp;
3105 break;
3106 case FF_CMP_NSSE:
3107 cmp[i]= c->nsse[i];
3108 break;
3109 case FF_CMP_W53:
3110 cmp[i]= c->w53[i];
3111 break;
3112 case FF_CMP_W97:
3113 cmp[i]= c->w97[i];
3114 break;
3115 default:
3116 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3117 }
3118 }
3119 }
3120
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 * Zero all six 64-coefficient blocks of a macroblock in one call.
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
3128
/** Element-wise byte add with 8-bit wraparound: dst[i] += src[i]. */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    /* plain loop; the compiler unrolls/vectorizes this itself */
    for(i=0; i<w; i++)
        dst[i] += src[i];
}
3144
/** Element-wise byte difference with 8-bit wraparound: dst[i] = src1[i] - src2[i]. */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    /* plain loop; the compiler unrolls/vectorizes this itself */
    for(i=0; i<w; i++)
        dst[i] = src1[i] - src2[i];
}
3160
/**
 * HuffYUV median-prediction subtraction:
 * dst[i] = src2[i] - median(left, top, left + top - topleft),
 * where src1 is the previous (top) line and src2 the current line.
 * *left and *left_top carry the running prediction context in and out,
 * so consecutive calls can process one line at a time.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;    /* l = left neighbour, lt = top-left neighbour */

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        /* gradient predictor folded to 8 bits before the median */
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
3178
/* butterfly: write sum and difference of two inputs into two outputs */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* in-place butterfly on two lvalues */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* last butterfly stage folded directly into the |.| accumulation */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3193
/**
 * SATD: 8x8 Hadamard transform of the difference src-dst, returning the
 * sum of absolute transform coefficients. Rows are transformed first
 * (three butterfly stages), then columns, with the final column stage
 * folded into the accumulation via BUTTERFLYA.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal (per-row) transform of the difference */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical (per-column) transform; last stage folded into the sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3245
/**
 * Intra SATD: 8x8 Hadamard transform of the source block itself, sum of
 * absolute coefficients with the DC term (the mean) subtracted at the end.
 * Same butterfly network as hadamard8_diff8x8_c.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal (per-row) transform */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical (per-column) transform; last stage folded into the sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3293
/**
 * DCT-domain SAD: forward DCT of the pixel difference, then sum of
 * absolute transform coefficients.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];  /* 8-byte aligned DCT scratch */
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum+= ABS(temp[i]);

    return sum;
}
3310
/**
 * DCT-domain max: forward DCT of the pixel difference, returning the
 * largest absolute transform coefficient.
 */
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];  /* 8-byte aligned DCT scratch */
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, ABS(temp[i]));

    return sum;
}
3327
3328 void simple_idct(DCTELEM *block); //FIXME
3329
/**
 * Quantization-distortion metric: run the difference block through
 * quantize -> dequantize -> IDCT and return the squared error against
 * the unprocessed difference (bak), i.e. the distortion the codec's
 * quantizer would introduce at the current qscale.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];  /* two aligned 64-coeff blocks */
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;  /* evaluated as an inter block */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* keep the untouched difference for comparison */
    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
3353
3354 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3355 MpegEncContext * const s= (MpegEncContext *)c;
3356 const uint8_t *scantable= s->intra_scantable.permutated;
3357 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3358 uint64_t __align8 aligned_bak[stride];
3359 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3360 uint8_t * const bak= (uint8_t*)aligned_bak;
3361 int i, last, run, bits, level, distoration, start_i;
3362 const int esc_length= s->ac_esc_length;
3363 uint8_t * length;
3364 uint8_t * last_length;
3365
3366 assert(h==8);
3367
3368 for(i=0; i<8; i++){
3369 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3370 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3371 }
3372
3373 s->dsp.diff_pixels(temp, src1, src2, stride);
3374
3375 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3376
3377 bits=0;
3378
3379 if (s->mb_intra) {
3380 start_i = 1;
3381 length = s->intra_ac_vlc_length;
3382 last_length= s->intra_ac_vlc_last_length;
3383 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3384 } else {
3385 start_i = 0;
3386 length = s->inter_ac_vlc_length;
3387 last_length= s->inter_ac_vlc_last_length;
3388 }
3389
3390 if(last>=start_i){
3391 run=0;
3392 for(i=start_i; i<last; i++){
3393 int j= scantable[i];
3394 level= temp[j];
3395
3396 if(level){
3397 level+=64;
3398 if((level&(~127)) == 0){
3399 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3400 }else
3401 bits+= esc_length;
3402 run=0;
3403 }else
3404 run++;
3405 }
3406 i= scantable[last];
3407
3408 level= temp[i] + 64;
3409
3410 assert(level - 64);
3411
3412 if((level&(~127)) == 0){
3413 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3414 }else
3415 bits+= esc_length;
3416
3417 }
3418
3419 if(last>=0){
3420 if(s->mb_intra)
3421 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3422 else