/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file dsputil.c
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "mpegvideo.h"
#include "simple_idct.h"
#include "faandct.h"
#include "h263.h"
#include "snow.h"

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* flacenc.c */
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
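
/* Illustrative reading of the table above: ff_zigzag_direct[k] is the
 * raster-scan index of the k-th coefficient in zigzag order, e.g.
 * ff_zigzag_direct[4] == 9, so the 5th coefficient visited lies at
 * row 1, column 1 of the 8x8 block. */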

/* Specific zigzag scan for the 248 IDCT. NOTE that, unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permuted inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };

const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
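
/* Illustrative use of the table above (worked example, not compiled):
 * with b == 3, ff_inverse[3] == 1431655766 and a == 100,
 *     (uint32_t)((100 * 1431655766ULL) >> 32) == 33 == 100/3,
 * i.e. one multiply and one shift replace the integer division. */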

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

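/* Sum of all pixels of a 16x16 block. */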
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

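/* Sum of squared pixel values of a 16x16 block. */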
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}

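/* Sum of squared errors between two blocks of width 4, 8 or 16 and
 * height h; the squares come from the ff_squareTbl lookup (indexed by
 * the signed difference) instead of a multiply per pixel. */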
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

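/* Wavelet error metric (illustrative summary): the difference block is
 * transformed with the snow DWT (type 0 = 9/7, type 1 = 5/3) and the
 * subband coefficients are accumulated with the fixed per-band weights
 * in the scale[][][][] table below. */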
#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif

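/* Load an 8x8 pixel block (or, below, the difference of two blocks)
 * into an array of DCT coefficients. */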
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}


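/* Store DCT output as pixels, clamping each value to the 0..255 range
 * through the crop table (full 8x8 block, plus 4-wide and 2-wide
 * variants below). */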
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* write the clamped pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* write the clamped pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* write the clamped pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* add the coefficients to the pixels and clamp */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* add the coefficients to the pixels and clamp */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* add the coefficients to the pixels and clamp */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 32 bit variant (the 64 bit code above is disabled)

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

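/* scalar rounding averages of two and four pixel values */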
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

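/* 1/16-pel bilinear interpolation of an 8-pixel-wide block, used by the
 * MPEG-4 GMC fast path with a single warp point: the four weights A..D
 * sum to 256, hence the >>8 after adding the rounder. */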
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}

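/* General global motion compensation: for each destination pixel the
 * source position is stepped by (dxx,dyx) along a row and by (dxy,dyy)
 * between rows, sampled bilinearly with 1<<shift subpel resolution, and
 * clipped against the picture border. */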
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

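/* Third-pel interpolation (SVQ3). The constants are fixed-point
 * reciprocals: 683 ~ 2048/3, so (683*(2*a + b + 1)) >> 11 ~ (2*a+b)/3,
 * and 2731 ~ 32768/12, so
 * (2731*(4*a + 3*b + 3*c + 2*d + 6)) >> 15 ~ (4*a+3*b+3*c+2*d)/12. */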
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

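/* H.264 chroma MC: 2D bilinear interpolation with eighth-pel fractions.
 * The weights A,B,C,D sum to 64, so op_put below rounds with +32 and
 * shifts by 6; when one fraction is zero (D==0) the filter degenerates
 * to a 1D filter with weights A and E=B+C along the remaining axis,
 * which the code exploits. */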
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put

static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}

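/* MPEG-4 quarter-pel MC: the half-pel positions use the 8-tap filter
 * (-1, 3, -6, 20, 20, -6, 3, -1)/32 (the taps sum to 32); near the
 * block border the outermost taps are mirrored back inside instead of
 * reading past the available samples. */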
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1710 }\
1711 \
1712 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1713 uint8_t full[16*9];\
1714 copy_block9(full, src, 16, stride, 9);\
1715 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1716 }\
1717 \
1718 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1719 uint8_t full[16*9];\
1720 uint8_t half[64];\
1721 copy_block9(full, src, 16, stride, 9);\
1722 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1723 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1724 }\
1725 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1726 uint8_t full[16*9];\
1727 uint8_t halfH[72];\
1728 uint8_t halfV[64];\
1729 uint8_t halfHV[64];\
1730 copy_block9(full, src, 16, stride, 9);\
1731 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1732 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1733 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1734 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1735 }\
1736 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1737 uint8_t full[16*9];\
1738 uint8_t halfH[72];\
1739 uint8_t halfHV[64];\
1740 copy_block9(full, src, 16, stride, 9);\
1741 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1742 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1743 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1744 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1745 }\
1746 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1747 uint8_t full[16*9];\
1748 uint8_t halfH[72];\
1749 uint8_t halfV[64];\
1750 uint8_t halfHV[64];\
1751 copy_block9(full, src, 16, stride, 9);\
1752 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1753 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1754 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1755 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1756 }\
1757 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1758 uint8_t full[16*9];\
1759 uint8_t halfH[72];\
1760 uint8_t halfHV[64];\
1761 copy_block9(full, src, 16, stride, 9);\
1762 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1763 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1764 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1765 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1766 }\
1767 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1768 uint8_t full[16*9];\
1769 uint8_t halfH[72];\
1770 uint8_t halfV[64];\
1771 uint8_t halfHV[64];\
1772 copy_block9(full, src, 16, stride, 9);\
1773 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1774 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1775 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1776 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1777 }\
1778 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1779 uint8_t full[16*9];\
1780 uint8_t halfH[72];\
1781 uint8_t halfHV[64];\
1782 copy_block9(full, src, 16, stride, 9);\
1783 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1784 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1785 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1786 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1787 }\
1788 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1789 uint8_t full[16*9];\
1790 uint8_t halfH[72];\
1791 uint8_t halfV[64];\
1792 uint8_t halfHV[64];\
1793 copy_block9(full, src, 16, stride, 9);\
1794 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1795 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1796 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1797 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1798 }\
1799 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1800 uint8_t full[16*9];\
1801 uint8_t halfH[72];\
1802 uint8_t halfHV[64];\
1803 copy_block9(full, src, 16, stride, 9);\
1804 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1805 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1806 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1807 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1808 }\
1809 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1810 uint8_t halfH[72];\
1811 uint8_t halfHV[64];\
1812 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1813 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1814 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1815 }\
1816 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1817 uint8_t halfH[72];\
1818 uint8_t halfHV[64];\
1819 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1820 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1821 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1822 }\
1823 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1824 uint8_t full[16*9];\
1825 uint8_t halfH[72];\
1826 uint8_t halfV[64];\
1827 uint8_t halfHV[64];\
1828 copy_block9(full, src, 16, stride, 9);\
1829 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1830 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1831 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1832 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1833 }\
1834 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1835 uint8_t full[16*9];\
1836 uint8_t halfH[72];\
1837 copy_block9(full, src, 16, stride, 9);\
1838 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1839 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1840 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1841 }\
1842 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1843 uint8_t full[16*9];\
1844 uint8_t halfH[72];\
1845 uint8_t halfV[64];\
1846 uint8_t halfHV[64];\
1847 copy_block9(full, src, 16, stride, 9);\
1848 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1849 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1850 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1851 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1852 }\
1853 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1854 uint8_t full[16*9];\
1855 uint8_t halfH[72];\
1856 copy_block9(full, src, 16, stride, 9);\
1857 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1858 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1859 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1860 }\
1861 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1862 uint8_t halfH[72];\
1863 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1864 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1865 }\
1866 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1867 OPNAME ## pixels16_c(dst, src, stride, 16);\
1868 }\
1869 \
1870 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1871 uint8_t half[256];\
1872 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1873 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1874 }\
1875 \
1876 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1877 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1878 }\
1879 \
1880 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1881 uint8_t half[256];\
1882 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1883 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1884 }\
1885 \
1886 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1887 uint8_t full[24*17];\
1888 uint8_t half[256];\
1889 copy_block17(full, src, 24, stride, 17);\
1890 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1891 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1892 }\
1893 \
1894 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1895 uint8_t full[24*17];\
1896 copy_block17(full, src, 24, stride, 17);\
1897 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1898 }\
1899 \
1900 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1901 uint8_t full[24*17];\
1902 uint8_t half[256];\
1903 copy_block17(full, src, 24, stride, 17);\
1904 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1905 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1906 }\
1907 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1908 uint8_t full[24*17];\
1909 uint8_t halfH[272];\
1910 uint8_t halfV[256];\
1911 uint8_t halfHV[256];\
1912 copy_block17(full, src, 24, stride, 17);\
1913 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1914 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1915 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1916 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1917 }\
1918 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1919 uint8_t full[24*17];\
1920 uint8_t halfH[272];\
1921 uint8_t halfHV[256];\
1922 copy_block17(full, src, 24, stride, 17);\
1923 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1924 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1925 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1926 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1927 }\
1928 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1929 uint8_t full[24*17];\
1930 uint8_t halfH[272];\
1931 uint8_t halfV[256];\
1932 uint8_t halfHV[256];\
1933 copy_block17(full, src, 24, stride, 17);\
1934 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1935 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1936 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1937 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1938 }\
1939 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1940 uint8_t full[24*17];\
1941 uint8_t halfH[272];\
1942 uint8_t halfHV[256];\
1943 copy_block17(full, src, 24, stride, 17);\
1944 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1945 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1946 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1947 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1948 }\
1949 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1950 uint8_t full[24*17];\
1951 uint8_t halfH[272];\
1952 uint8_t halfV[256];\
1953 uint8_t halfHV[256];\
1954 copy_block17(full, src, 24, stride, 17);\
1955 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1956 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1957 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1958 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1959 }\
1960 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1961 uint8_t full[24*17];\
1962 uint8_t halfH[272];\
1963 uint8_t halfHV[256];\
1964 copy_block17(full, src, 24, stride, 17);\
1965 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1966 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1967 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1968 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1969 }\
1970 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1971 uint8_t full[24*17];\
1972 uint8_t halfH[272];\
1973 uint8_t halfV[256];\
1974 uint8_t halfHV[256];\
1975 copy_block17(full, src, 24, stride, 17);\
1976 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1977 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1978 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1979 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1980 }\
1981 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1982 uint8_t full[24*17];\
1983 uint8_t halfH[272];\
1984 uint8_t halfHV[256];\
1985 copy_block17(full, src, 24, stride, 17);\
1986 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1987 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1988 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1989 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1990 }\
1991 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1992 uint8_t halfH[272];\
1993 uint8_t halfHV[256];\
1994 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1995 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1996 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1997 }\
1998 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1999 uint8_t halfH[272];\
2000 uint8_t halfHV[256];\
2001 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2002 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2003 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2004 }\
2005 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2006 uint8_t full[24*17];\
2007 uint8_t halfH[272];\
2008 uint8_t halfV[256];\
2009 uint8_t halfHV[256];\
2010 copy_block17(full, src, 24, stride, 17);\
2011 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2012 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2013 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2014 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2015 }\
2016 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2017 uint8_t full[24*17];\
2018 uint8_t halfH[272];\
2019 copy_block17(full, src, 24, stride, 17);\
2020 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2021 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2022 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2023 }\
2024 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2025 uint8_t full[24*17];\
2026 uint8_t halfH[272];\
2027 uint8_t halfV[256];\
2028 uint8_t halfHV[256];\
2029 copy_block17(full, src, 24, stride, 17);\
2030 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2031 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2032 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2033 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2034 }\
2035 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2036 uint8_t full[24*17];\
2037 uint8_t halfH[272];\
2038 copy_block17(full, src, 24, stride, 17);\
2039 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2040 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2041 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2042 }\
2043 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2044 uint8_t halfH[272];\
2045 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2046 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2047 }
2048
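/* Store/rounding primitives for the MPEG-4 qpel code above.  The 8-tap
 * filter (-1, 3, -6, 20, 20, -6, 3, -1) sums to 32, so a filtered value b
 * is normalized as (b + 16) >> 5 (round to nearest: b = 255*32 = 8160
 * gives cm[(8160 + 16) >> 5] = cm[255] = 255); the no_rnd variants add
 * only 15, rounding ties down as MPEG-4 no-rounding mode requires.  Taps
 * that would read past the last input sample are mirrored back into the
 * block, which is why the final rows/columns of each lowpass function look
 * asymmetric (src9 -> src8, src10 -> src7, src11 -> src6 in the 8-pel
 * case).  QPEL_MC() instantiates put, put_no_rnd and avg versions of all
 * 16 quarter-pel positions; e.g. put_qpel8_mc10_c() expands to roughly
 *     uint8_t half[64];
 *     put_mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);
 *     put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
 */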
2049 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2050 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2051 #define op_put(a, b) a = cm[((b) + 16)>>5]
2052 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2053
2054 QPEL_MC(0, put_ , _ , op_put)
2055 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2056 QPEL_MC(0, avg_ , _ , op_avg)
2057 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2058 #undef op_avg
2059 #undef op_avg_no_rnd
2060 #undef op_put
2061 #undef op_put_no_rnd
2062
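/* H.264 half-pel interpolation.  H264_LOWPASS generates the C versions of
 * the standard 6-tap (1, -5, 20, 20, -5, 1) filter for each block size:
 * _h_lowpass filters horizontally, _v_lowpass vertically, and _hv_lowpass
 * runs the horizontal pass into an int16_t tmp[] buffer (intermediate
 * values exceed 8 bits) and then filters that buffer vertically.  OP
 * normalizes the single-pass results and OP2 the two-pass hv results; see
 * the op_put/op2_put macros after this block.
 */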
2063 #if 1
2064 #define H264_LOWPASS(OPNAME, OP, OP2) \
2065 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2066 const int h=2;\
2067 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2068 int i;\
2069 for(i=0; i<h; i++)\
2070 {\
2071 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2072 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2073 dst+=dstStride;\
2074 src+=srcStride;\
2075 }\
2076 }\
2077 \
2078 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2079 const int w=2;\
2080 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2081 int i;\
2082 for(i=0; i<w; i++)\
2083 {\
2084 const int srcB= src[-2*srcStride];\
2085 const int srcA= src[-1*srcStride];\
2086 const int src0= src[0 *srcStride];\
2087 const int src1= src[1 *srcStride];\
2088 const int src2= src[2 *srcStride];\
2089 const int src3= src[3 *srcStride];\
2090 const int src4= src[4 *srcStride];\
2091 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2092 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2093 dst++;\
2094 src++;\
2095 }\
2096 }\
2097 \
2098 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2099 const int h=2;\
2100 const int w=2;\
2101 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2102 int i;\
2103 src -= 2*srcStride;\
2104 for(i=0; i<h+5; i++)\
2105 {\
2106 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2107 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2108 tmp+=tmpStride;\
2109 src+=srcStride;\
2110 }\
2111 tmp -= tmpStride*(h+5-2);\
2112 for(i=0; i<w; i++)\
2113 {\
2114 const int tmpB= tmp[-2*tmpStride];\
2115 const int tmpA= tmp[-1*tmpStride];\
2116 const int tmp0= tmp[0 *tmpStride];\
2117 const int tmp1= tmp[1 *tmpStride];\
2118 const int tmp2= tmp[2 *tmpStride];\
2119 const int tmp3= tmp[3 *tmpStride];\
2120 const int tmp4= tmp[4 *tmpStride];\
2121 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2122 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2123 dst++;\
2124 tmp++;\
2125 }\
2126 }\
2127 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2128 const int h=4;\
2129 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2130 int i;\
2131 for(i=0; i<h; i++)\
2132 {\
2133 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2134 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2135 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2136 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2137 dst+=dstStride;\
2138 src+=srcStride;\
2139 }\
2140 }\
2141 \
2142 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2143 const int w=4;\
2144 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2145 int i;\
2146 for(i=0; i<w; i++)\
2147 {\
2148 const int srcB= src[-2*srcStride];\
2149 const int srcA= src[-1*srcStride];\
2150 const int src0= src[0 *srcStride];\
2151 const int src1= src[1 *srcStride];\
2152 const int src2= src[2 *srcStride];\
2153 const int src3= src[3 *srcStride];\
2154 const int src4= src[4 *srcStride];\
2155 const int src5= src[5 *srcStride];\
2156 const int src6= src[6 *srcStride];\
2157 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2158 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2159 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2160 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2161 dst++;\
2162 src++;\
2163 }\
2164 }\
2165 \
2166 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2167 const int h=4;\
2168 const int w=4;\
2169 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2170 int i;\
2171 src -= 2*srcStride;\
2172 for(i=0; i<h+5; i++)\
2173 {\
2174 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2175 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2176 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2177 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2178 tmp+=tmpStride;\
2179 src+=srcStride;\
2180 }\
2181 tmp -= tmpStride*(h+5-2);\
2182 for(i=0; i<w; i++)\
2183 {\
2184 const int tmpB= tmp[-2*tmpStride];\
2185 const int tmpA= tmp[-1*tmpStride];\
2186 const int tmp0= tmp[0 *tmpStride];\
2187 const int tmp1= tmp[1 *tmpStride];\
2188 const int tmp2= tmp[2 *tmpStride];\
2189 const int tmp3= tmp[3 *tmpStride];\
2190 const int tmp4= tmp[4 *tmpStride];\
2191 const int tmp5= tmp[5 *tmpStride];\
2192 const int tmp6= tmp[6 *tmpStride];\
2193 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2194 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2195 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2196 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2197 dst++;\
2198 tmp++;\
2199 }\
2200 }\
2201 \
2202 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2203 const int h=8;\
2204 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2205 int i;\
2206 for(i=0; i<h; i++)\
2207 {\
2208 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2209 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2210 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2211 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2212 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2213 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2214 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2215 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2216 dst+=dstStride;\
2217 src+=srcStride;\
2218 }\
2219 }\
2220 \
2221 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2222 const int w=8;\
2223 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2224 int i;\
2225 for(i=0; i<w; i++)\
2226 {\
2227 const int srcB= src[-2*srcStride];\
2228 const int srcA= src[-1*srcStride];\
2229 const int src0= src[0 *srcStride];\
2230 const int src1= src[1 *srcStride];\
2231 const int src2= src[2 *srcStride];\
2232 const int src3= src[3 *srcStride];\
2233 const int src4= src[4 *srcStride];\
2234 const int src5= src[5 *srcStride];\
2235 const int src6= src[6 *srcStride];\
2236 const int src7= src[7 *srcStride];\
2237 const int src8= src[8 *srcStride];\
2238 const int src9= src[9 *srcStride];\
2239 const int src10=src[10*srcStride];\
2240 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2241 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2242 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2243 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2244 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2245 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2246 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2247 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2248 dst++;\
2249 src++;\
2250 }\
2251 }\
2252 \
2253 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2254 const int h=8;\
2255 const int w=8;\
2256 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2257 int i;\
2258 src -= 2*srcStride;\
2259 for(i=0; i<h+5; i++)\
2260 {\
2261 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2262 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2263 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2264 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2265 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2266 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2267 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2268 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2269 tmp+=tmpStride;\
2270 src+=srcStride;\
2271 }\
2272 tmp -= tmpStride*(h+5-2);\
2273 for(i=0; i<w; i++)\
2274 {\
2275 const int tmpB= tmp[-2*tmpStride];\
2276 const int tmpA= tmp[-1*tmpStride];\
2277 const int tmp0= tmp[0 *tmpStride];\
2278 const int tmp1= tmp[1 *tmpStride];\
2279 const int tmp2= tmp[2 *tmpStride];\
2280 const int tmp3= tmp[3 *tmpStride];\
2281 const int tmp4= tmp[4 *tmpStride];\
2282 const int tmp5= tmp[5 *tmpStride];\
2283 const int tmp6= tmp[6 *tmpStride];\
2284 const int tmp7= tmp[7 *tmpStride];\
2285 const int tmp8= tmp[8 *tmpStride];\
2286 const int tmp9= tmp[9 *tmpStride];\
2287 const int tmp10=tmp[10*tmpStride];\
2288 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2289 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2290 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2291 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2292 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2293 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2294 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2295 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2296 dst++;\
2297 tmp++;\
2298 }\
2299 }\
2300 \
2301 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2302 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2303 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2304 src += 8*srcStride;\
2305 dst += 8*dstStride;\
2306 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2307 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2308 }\
2309 \
2310 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2311 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2312 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2313 src += 8*srcStride;\
2314 dst += 8*dstStride;\
2315 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2316 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2317 }\
2318 \
2319 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2320 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2321 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2322 src += 8*srcStride;\
2323 dst += 8*dstStride;\
2324 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2325 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2326 }\
2327
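/* H264_MC generates the 16 quarter-pel motion compensation functions for a
 * given block size.  The mcXY suffix encodes the fractional position: X is
 * the horizontal and Y the vertical offset in quarter pels (0..3).  mc00
 * is a plain copy, mc20/mc02 are the half-pel filters, mc22 is the center
 * position using the hv path, and the remaining positions average two
 * neighboring full- or half-pel planes with pixels ## SIZE ## _l2().
 * For example H264_MC(put_, 4) produces put_h264_qpel4_mc21_c(), roughly
 *     int16_t tmp[4*(4+5)];
 *     uint8_t halfH[4*4];
 *     uint8_t halfHV[4*4];
 *     put_h264_qpel4_h_lowpass(halfH, src, 4, stride);
 *     put_h264_qpel4_hv_lowpass(halfHV, tmp, src, 4, 4, stride);
 *     put_pixels4_l2(dst, halfH, halfHV, stride, 4, 4, 4);
 */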
2328 #define H264_MC(OPNAME, SIZE) \
2329 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2330 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2331 }\
2332 \
2333 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2334 uint8_t half[SIZE*SIZE];\
2335 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2336 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2337 }\
2338 \
2339 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2340 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2341 }\
2342 \
2343 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2344 uint8_t half[SIZE*SIZE];\
2345 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2346 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2347 }\
2348 \
2349 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2350 uint8_t full[SIZE*(SIZE+5)];\
2351 uint8_t * const full_mid= full + SIZE*2;\
2352 uint8_t half[SIZE*SIZE];\
2353 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2354 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2355 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2356 }\
2357 \
2358 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2359 uint8_t full[SIZE*(SIZE+5)];\
2360 uint8_t * const full_mid= full + SIZE*2;\
2361 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2362 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2363 }\
2364 \
2365 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2366 uint8_t full[SIZE*(SIZE+5)];\
2367 uint8_t * const full_mid= full + SIZE*2;\
2368 uint8_t half[SIZE*SIZE];\
2369 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2370 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2371 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2372 }\
2373 \
2374 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2375 uint8_t full[SIZE*(SIZE+5)];\
2376 uint8_t * const full_mid= full + SIZE*2;\
2377 uint8_t halfH[SIZE*SIZE];\
2378 uint8_t halfV[SIZE*SIZE];\
2379 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2380 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2381 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2382 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2383 }\
2384 \
2385 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2386 uint8_t full[SIZE*(SIZE+5)];\
2387 uint8_t * const full_mid= full + SIZE*2;\
2388 uint8_t halfH[SIZE*SIZE];\
2389 uint8_t halfV[SIZE*SIZE];\
2390 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2391 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2392 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2393 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2394 }\
2395 \
2396 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2397 uint8_t full[SIZE*(SIZE+5)];\
2398 uint8_t * const full_mid= full + SIZE*2;\
2399 uint8_t halfH[SIZE*SIZE];\
2400 uint8_t halfV[SIZE*SIZE];\
2401 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2402 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2403 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2404 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2405 }\
2406 \
2407 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2408 uint8_t full[SIZE*(SIZE+5)];\
2409 uint8_t * const full_mid= full + SIZE*2;\
2410 uint8_t halfH[SIZE*SIZE];\
2411 uint8_t halfV[SIZE*SIZE];\
2412 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2413 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2414 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2415 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2416 }\
2417 \
2418 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2419 int16_t tmp[SIZE*(SIZE+5)];\
2420 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2421 }\
2422 \
2423 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2424 int16_t tmp[SIZE*(SIZE+5)];\
2425 uint8_t halfH[SIZE*SIZE];\
2426 uint8_t halfHV[SIZE*SIZE];\
2427 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2428 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2429 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2430 }\
2431 \
2432 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2433 int16_t tmp[SIZE*(SIZE+5)];\
2434 uint8_t halfH[SIZE*SIZE];\
2435 uint8_t halfHV[SIZE*SIZE];\
2436 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2437 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2438 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2439 }\
2440 \
2441 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2442 uint8_t full[SIZE*(SIZE+5)];\
2443 uint8_t * const full_mid= full + SIZE*2;\
2444 int16_t tmp[SIZE*(SIZE+5)];\
2445 uint8_t halfV[SIZE*SIZE];\
2446 uint8_t halfHV[SIZE*SIZE];\
2447 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2448 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2449 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2450 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2451 }\
2452 \
2453 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2454 uint8_t full[SIZE*(SIZE+5)];\
2455 uint8_t * const full_mid= full + SIZE*2;\
2456 int16_t tmp[SIZE*(SIZE+5)];\
2457 uint8_t halfV[SIZE*SIZE];\
2458 uint8_t halfHV[SIZE*SIZE];\
2459 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2460 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2461 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2462 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2463 }\
2464
2465 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2466 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2467 #define op_put(a, b) a = cm[((b) + 16)>>5]
2468 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2469 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2470
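/* One pass of the 6-tap filter scales by 32, so the hv path scales by
 * 32*32 = 1024: op2_* therefore normalize with (b + 512) >> 10 while op_*
 * use (b + 16) >> 5.  A flat area of value p gives b = 1024*p after both
 * passes and (1024*p + 512) >> 10 = p, so DC is preserved exactly.
 */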
2471 H264_LOWPASS(put_ , op_put, op2_put)
2472 H264_LOWPASS(avg_ , op_avg, op2_avg)
2473 H264_MC(put_, 2)
2474 H264_MC(put_, 4)
2475 H264_MC(put_, 8)
2476 H264_MC(put_, 16)
2477 H264_MC(avg_, 4)
2478 H264_MC(avg_, 8)
2479 H264_MC(avg_, 16)
2480
2481 #undef op_avg
2482 #undef op_put
2483 #undef op2_avg
2484 #undef op2_put
2485 #endif
2486
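/* H.264 weighted prediction.  op_scale1 implements unidirectional
 * weighting, block[x] = clip((block[x]*weight + offset') >> log2_denom);
 * the function below folds the pre-shifted offset and the rounding term
 * 1 << (log2_denom-1) into offset' up front.  op_scale2 blends two
 * references for biprediction with weights weightd/weights and a
 * >> (log2_denom+1) normalization, the offset again carrying the rounding
 * bias.
 */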
2487 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2488 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2489 #define H264_WEIGHT(W,H) \
2490 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2491 int y; \
2492 offset <<= log2_denom; \
2493 if(log2_denom) offset += 1<<(log2_denom-1); \
2494 for(y=0; y<H; y++, block += stride){ \
2495 op_scale1(0); \
2496 op_scale1(1); \
2497 if(W==2) continue; \
2498 op_scale1(2); \
2499 op_scale1(3); \
2500 if(W==4) continue; \
2501 op_scale1(4); \
2502 op_scale1(5); \
2503 op_scale1(6); \
2504 op_scale1(7); \
2505 if(W==8) continue; \
2506 op_scale1(8); \
2507 op_scale1(9); \
2508 op_scale1(10); \
2509 op_scale1(11); \
2510 op_scale1(12); \
2511 op_scale1(13); \
2512 op_scale1(14); \
2513 op_scale1(15); \
2514 } \
2515 } \
2516 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2517 int y; \
2518 offset = ((offset + 1) | 1) << log2_denom; \
2519 for(y=0; y<H; y++, dst += stride, src += stride){ \
2520 op_scale2(0); \
2521 op_scale2(1); \
2522 if(W==2) continue; \
2523 op_scale2(2); \
2524 op_scale2(3); \
2525 if(W==4) continue; \
2526 op_scale2(4); \
2527 op_scale2(5); \
2528 op_scale2(6); \
2529 op_scale2(7); \
2530 if(W==8) continue; \
2531 op_scale2(8); \
2532 op_scale2(9); \
2533 op_scale2(10); \
2534 op_scale2(11); \
2535 op_scale2(12); \
2536 op_scale2(13); \
2537 op_scale2(14); \
2538 op_scale2(15); \
2539 } \
2540 }
2541
2542 H264_WEIGHT(16,16)
2543 H264_WEIGHT(16,8)
2544 H264_WEIGHT(8,16)
2545 H264_WEIGHT(8,8)
2546 H264_WEIGHT(8,4)
2547 H264_WEIGHT(4,8)
2548 H264_WEIGHT(4,4)
2549 H264_WEIGHT(4,2)
2550 H264_WEIGHT(2,4)
2551 H264_WEIGHT(2,2)
2552
2553 #undef op_scale1
2554 #undef op_scale2
2555 #undef H264_WEIGHT
2556
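/* WMV2 "mspel" half-pel interpolation: a 4-tap (-1, 9, 9, -1) bicubic
 * filter whose taps sum to 16, normalized as (v + 8) >> 4.  The
 * put_mspel8_mcXY wrappers further below combine the horizontal and
 * vertical passes in the same way the qpel code does.
 */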
2557 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2558 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2559 int i;
2560
2561 for(i=0; i<h; i++){
2562 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2563 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2564 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2565 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2566 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2567 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2568 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2569 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2570 dst+=dstStride;
2571 src+=srcStride;
2572 }
2573 }
2574
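/* Exported mc00 wrappers: the CAVS and VC-1 integer-pel cases are plain
 * pixel copies, but put_pixels8_c() and friends are static to this file,
 * so the codec-specific init code is given these ff_-prefixed entry
 * points instead.
 */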
2575 #ifdef CONFIG_CAVS_DECODER
2576 /* AVS specific */
2577 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2578
2579 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2580 put_pixels8_c(dst, src, stride, 8);
2581 }
2582 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2583 avg_pixels8_c(dst, src, stride, 8);
2584 }
2585 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2586 put_pixels16_c(dst, src, stride, 16);
2587 }
2588 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2589 avg_pixels16_c(dst, src, stride, 16);
2590 }
2591 #endif /* CONFIG_CAVS_DECODER */
2592
2593 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2594 /* VC-1 specific */
2595 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2596
2597 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2598 put_pixels8_c(dst, src, stride, 8);
2599 }
2600 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2601
2602 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2603
2604 /* H264 specific */
2605 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2606
2607 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2608 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2609 int i;
2610
2611 for(i=0; i<w; i++){
2612 const int src_1= src[ -srcStride];
2613 const int src0 = src[0 ];
2614 const int src1 = src[ srcStride];
2615 const int src2 = src[2*srcStride];
2616 const int src3 = src[3*srcStride];
2617 const int src4 = src[4*srcStride];
2618 const int src5 = src[5*srcStride];
2619 const int src6 = src[6*srcStride];
2620 const int src7 = src[7*srcStride];
2621 const int src8 = src[8*srcStride];
2622 const int src9 = src[9*srcStride];
2623 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2624 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2625 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2626 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2627 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2628 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2629 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2630 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2631 src++;
2632 dst++;
2633 }
2634 }
2635
2636 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2637 put_pixels8_c(dst, src, stride, 8);
2638 }
2639
2640 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2641 uint8_t half[64];
2642 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2643 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2644 }
2645
2646 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2647 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2648 }
2649
2650 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2651 uint8_t half[64];
2652 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2653 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2654 }
2655
2656 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2657 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2658 }
2659
2660 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2661 uint8_t halfH[88];
2662 uint8_t halfV[64];
2663 uint8_t halfHV[64];
2664 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2665 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2666 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2667 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2668 }
2669 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2670 uint8_t halfH[88];
2671 uint8_t halfV[64];
2672 uint8_t halfHV[64];
2673 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2674 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2675 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2676 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2677 }
2678 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2679 uint8_t halfH[88];
2680 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2681 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2682 }
2683
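/* H.263 deblocking (in the style of Annex J) across one block edge.  For
 * each of the 8 pixel lines, d measures the step across the edge from the
 * four pixels p0..p3; d1 is d passed through a ramp that is zero outside
 * (-2*strength, 2*strength), so small blocking steps are corrected while
 * real image edges are left alone.  p1/p2 move by d1 and p0/p3 by d2,
 * which is limited to half of |d1|.  The "p&256" test is a cheap clip to
 * 0..255: it only fires when the adjustment pushed the value out of
 * range, and ~(p>>31) then evaluates to 0 for negative values and to 255
 * for overflows.
 */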
2684 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2685 if(ENABLE_ANY_H263) {
2686 int x;
2687 const int strength= ff_h263_loop_filter_strength[qscale];
2688
2689 for(x=0; x<8; x++){
2690 int d1, d2, ad1;
2691 int p0= src[x-2*stride];
2692 int p1= src[x-1*stride];
2693 int p2= src[x+0*stride];
2694 int p3= src[x+1*stride];
2695 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2696
2697 if (d<-2*strength) d1= 0;
2698 else if(d<- strength) d1=-2*strength - d;
2699 else if(d< strength) d1= d;
2700 else if(d< 2*strength) d1= 2*strength - d;
2701 else d1= 0;
2702
2703 p1 += d1;
2704 p2 -= d1;
2705 if(p1&256) p1= ~(p1>>31);
2706 if(p2&256) p2= ~(p2>>31);
2707
2708 src[x-1*stride] = p1;
2709 src[x+0*stride] = p2;
2710
2711 ad1= FFABS(d1)>>1;
2712
2713 d2= av_clip((p0-p3)/4, -ad1, ad1);
2714
2715 src[x-2*stride] = p0 - d2;
2716 src[x+1*stride] = p3 + d2;
2717 }
2718 }
2719 }
2720
2721 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2722 if(ENABLE_ANY_H263) {
2723 int y;
2724 const int strength= ff_h263_loop_filter_strength[qscale];
2725
2726 for(y=0; y<8; y++){
2727 int d1, d2, ad1;
2728 int p0= src[y*stride-2];
2729 int p1= src[y*stride-1];
2730 int p2= src[y*stride+0];
2731 int p3= src[y*stride+1];
2732 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2733
2734 if (d<-2*strength) d1= 0;
2735 else if(d<- strength) d1=-2*strength - d;
2736 else if(d< strength) d1= d;
2737 else if(d< 2*strength) d1= 2*strength - d;
2738 else d1= 0;
2739
2740 p1 += d1;
2741 p2 -= d1;
2742 if(p1&256) p1= ~(p1>>31);
2743 if(p2&256) p2= ~(p2>>31);
2744
2745 src[y*stride-1] = p1;
2746 src[y*stride+0] = p2;
2747
2748 ad1= FFABS(d1)>>1;
2749
2750 d2= av_clip((p0-p3)/4, -ad1, ad1);
2751
2752 src[y*stride-2] = p0 - d2;
2753 src[y*stride+1] = p3 + d2;
2754 }
2755 }
2756 }
2757
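/* H.261 loop filter: a separable [1 2 1]/4 smoothing kernel applied inside
 * the 8x8 block.  The vertical pass writes 4x-scaled sums into temp[]
 * (border rows are passed through, scaled by 4 to match); the horizontal
 * pass then filters temp[] and normalizes with (sum + 8) >> 4, while
 * border columns only undo the vertical scaling with (sum + 2) >> 2.
 */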
2758 static void h261_loop_filter_c(uint8_t *src, int stride){
2759 int x,y,xy,yz;
2760 int temp[64];
2761
2762 for(x=0; x<8; x++){
2763 temp[x ] = 4*src[x ];
2764 temp[x + 7*8] = 4*src[x + 7*stride];
2765 }
2766 for(y=1; y<7; y++){
2767 for(x=0; x<8; x++){
2768 xy = y * stride + x;
2769 yz = y * 8 + x;
2770 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2771 }
2772 }
2773
2774 for(y=0; y<8; y++){
2775 src[ y*stride] = (temp[ y*8] + 2)>>2;
2776 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2777 for(x=1; x<7; x++){
2778 xy = y * stride + x;
2779 yz = y * 8 + x;
2780 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
2781 }
2782 }
2783 }
2784
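/* H.264 in-loop deblocking, normal (tc-based) filtering.  xstride steps
 * across the edge and ystride along it, so the same body serves the
 * vertical and horizontal wrappers below.  Each 4-pixel segment i is
 * skipped when tc0[i] < 0; otherwise p0/q0 are corrected by a delta
 * clipped to +/-tc, where tc starts at tc0[i] and grows by one for each
 * side whose second neighbor (p2 or q2) is flat enough (|p2-p0| < beta)
 * to have p1/q1 adjusted as well.
 */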
2785 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2786 {
2787 int i, d;
2788 for( i = 0; i < 4; i++ ) {
2789 if( tc0[i] < 0 ) {
2790 pix += 4*ystride;
2791 continue;
2792 }
2793 for( d = 0; d < 4; d++ ) {
2794 const int p0 = pix[-1*xstride];
2795 const int p1 = pix[-2*xstride];
2796 const int p2 = pix[-3*xstride];
2797 const int q0 = pix[0];
2798 const int q1 = pix[1*xstride];
2799 const int q2 = pix[2*xstride];
2800
2801 if( FFABS( p0 - q0 ) < alpha &&
2802 FFABS( p1 - p0 ) < beta &&
2803 FFABS( q1 - q0 ) < beta ) {
2804
2805 int tc = tc0[i];
2806 int i_delta;
2807
2808 if( FFABS( p2 - p0 ) < beta ) {
2809 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2810 tc++;
2811 }
2812 if( FFABS( q2 - q0 ) < beta ) {
2813 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2814 tc++;
2815 }
2816
2817 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2818 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2819 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
2820 }
2821 pix += ystride;
2822 }
2823 }
2824 }
2825 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2826 {
2827 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2828 }
2829 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2830 {
2831 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
2832 }
2833
2834 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2835 {
2836 int i, d;
2837 for( i = 0; i < 4; i++ ) {
2838 const int tc = tc0[i];
2839 if( tc <= 0 ) {
2840 pix += 2*ystride;
2841 continue;
2842 }
2843 for( d = 0; d < 2; d++ ) {
2844 const int p0 = pix[-1*xstride];
2845 const int p1 = pix[-2*xstride];
2846 const int q0 = pix[0];
2847 const int q1 = pix[1*xstride];
2848
2849 if( FFABS( p0 - q0 ) < alpha &&
2850 FFABS( p1 - p0 ) < beta &&
2851 FFABS( q1 - q0 ) < beta ) {
2852
2853 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2854
2855 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
2856 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
2857 }
2858 pix += ystride;
2859 }
2860 }
2861 }
2862 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2863 {
2864 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
2865 }
2866 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2867 {
2868 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
2869 }
2870
2871 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2872 {
2873 int d;
2874 for( d = 0; d < 8; d++ ) {
2875 const int p0 = pix[-1*xstride];
2876 const int p1 = pix[-2*xstride];
2877 const int q0 = pix[0];
2878 const int q1 = pix[1*xstride];
2879
2880 if( FFABS( p0 - q0 ) < alpha &&
2881 FFABS( p1 - p0 ) < beta &&
2882 FFABS( q1 - q0 ) < beta ) {
2883
2884 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
2885 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
2886 }
2887 pix += ystride;
2888 }
2889 }
2890 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2891 {
2892 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
2893 }
2894 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2895 {
2896 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
2897 }
2898
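/* Sum-of-absolute-differences comparators for motion estimation.
 * pix_abs16/pix_abs8 compute plain SAD over a 16- or 8-pixel-wide block of
 * height h; the _x2, _y2 and _xy2 variants compare against the reference
 * interpolated to the horizontal, vertical or diagonal half-pel position
 * via avg2()/avg4(), matching what the rounding motion compensation would
 * produce.
 */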
2899 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2900 {
2901 int s, i;
2902
2903 s = 0;
2904 for(i=0;i<h;i++) {
2905 s += abs(pix1[0] - pix2[0]);
2906 s += abs(pix1[1] - pix2[1]);
2907 s += abs(pix1[2] - pix2[2]);
2908 s += abs(pix1[3] - pix2[3]);
2909 s += abs(pix1[4] - pix2[4]);
2910 s += abs(pix1[5] - pix2[5]);
2911 s += abs(pix1[6] - pix2[6]);
2912 s += abs(pix1[7] - pix2[7]);
2913 s += abs(pix1[8] - pix2[8]);
2914 s += abs(pix1[9] - pix2[9]);
2915 s += abs(pix1[10] - pix2[10]);
2916 s += abs(pix1[11] - pix2[11]);
2917 s += abs(pix1[12] - pix2[12]);
2918 s += abs(pix1[13] - pix2[13]);
2919 s += abs(pix1[14] - pix2[14]);
2920 s += abs(pix1[15] - pix2[15]);
2921 pix1 += line_size;
2922 pix2 += line_size;
2923 }
2924 return s;
2925 }
2926
2927 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2928 {
2929 int s, i;
2930
2931 s = 0;
2932 for(i=0;i<h;i++) {
2933 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2934 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2935 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2936 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2937 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2938 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2939 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2940 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2941 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2942 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2943 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2944 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2945 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2946 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2947 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2948 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
2949 pix1 += line_size;
2950 pix2 += line_size;
2951 }
2952 return s;
2953 }
2954
2955 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2956 {
2957 int s, i;
2958 uint8_t *pix3 = pix2 + line_size;
2959
2960 s = 0;
2961 for(i=0;i<h;i++) {
2962 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2963 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2964 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2965 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2966 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2967 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2968 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2969 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2970 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2971 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2972 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2973 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2974 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2975 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2976 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2977 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
2978 pix1 += line_size;
2979 pix2 += line_size;
2980 pix3 += line_size;
2981 }
2982 return s;
2983 }
2984
2985 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2986 {
2987 int s, i;
2988 uint8_t *pix3 = pix2 + line_size;
2989
2990 s = 0;
2991 for(i=0;i<h;i++) {
2992 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2993 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2994 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2995 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2996 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2997 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2998 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2999 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3000 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3001 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3002 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3003 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3004 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3005 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3006 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3007 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
3008 pix1 += line_size;
3009 pix2 += line_size;
3010 pix3 += line_size;
3011 }
3012 return s;
3013 }
3014
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

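/**
 * Noise-preserving sum of squared errors for a 16-pixel-wide block:
 * the plain SSE (score1) is combined with the difference in local 2x2
 * gradient energy between the two blocks (score2), weighted by
 * avctx->nsse_weight, or by 8 when no context is available.
 * nsse8_c below is the 8-pixel-wide counterpart.
 */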
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x  ])*(s1[x  ] - s2[x  ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x  ])*(s1[x  ] - s2[x  ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

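/**
 * Estimate the weighted squared error that would remain after adding
 * scale*basis (BASIS_SHIFT fixed point, rounded to RECON_SHIFT) to the
 * residual rem; w is a per-coefficient weight. This lets the encoder
 * score a candidate basis-function update without reconstructing the
 * block.
 */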
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}

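/**
 * Apply the update evaluated by try_8x8basis_c(): add scale*basis,
 * rounded from BASIS_SHIFT to RECON_SHIFT fixed point, to rem.
 */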
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}

/**
 * Permute an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}

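/* Comparison function that always returns 0, selected by FF_CMP_ZERO
   so the metric contributes nothing to the decision. */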
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

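/**
 * Fill the five block-size slots of cmp[] with the comparison function
 * selected by the low byte of type (FF_CMP_SAD, FF_CMP_SATD, ...).
 */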
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#ifdef CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
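
/* A minimal usage sketch (illustrative; the exact field names are an
 * assumption here, though calls of this shape appear in the encoder
 * setup code when it configures motion estimation):
 *
 *     ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
 *     ff_set_cmp(&s->dsp, s->dsp.mb_cmp, s->avctx->mb_cmp);
 */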

/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}

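/* Byte-wise dst[i] += src[i], manually unrolled by 8, with a scalar
   tail loop for the remaining 0-7 bytes. */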
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

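/* Byte-wise dst[i] = src1[i] - src2[i], unrolled the same way. */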
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] = src1[i+0]-src2[i+0];
        dst[i+1] = src1[i+1]-src2[i+1];
        dst[i+2] = src1[i+2]-src2[i+2];
        dst[i+3] = src1[i+3]-src2[i+3];
        dst[i+4] = src1[i+4]-src2[i+4];
        dst[i+5] = src1[i+5]-src2[i+5];
        dst[i+6] = src1[i+6]-src2[i+6];
        dst[i+7] = src1[i+7]-src2[i+7];
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

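/**
 * Median prediction as used by HuffYUV: for each pixel the predictor
 * is mid_pred(left, top, left + top - topleft), with the previous line
 * in src1 and the current line in src2; dst receives the prediction
 * error. *left and *left_top carry state between calls.
 */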
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}

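/* Butterfly helpers for the 8x8 Hadamard transforms below: BUTTERFLY2
   writes the sum/difference of two inputs to two outputs, BUTTERFLY1
   does the same in place, and BUTTERFLYA folds the final butterfly
   stage directly into a sum of absolute values. */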
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

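/**
 * SATD of an 8x8 block: an 8x8 Hadamard transform is applied to the
 * difference src - dst (rows first, then columns, with the last column
 * stage folded into BUTTERFLYA) and the absolute transform
 * coefficients are summed.
 */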
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
    static int maxi=0;
    if(sum>maxi){
        maxi=sum;
        printf("MAX:%d\n", maxi);
    }
#endif
    return sum;
}

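/**
 * Same butterfly network applied to the source block alone (no
 * reference); used as an intra comparison score.
 */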
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -=