3e7e43ac0b04ec631534d5dfc980b4de3912fc02
[libav.git] / libavcodec / dsputil.c
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7 *
8 * This file is part of FFmpeg.
9 *
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25 /**
26 * @file libavcodec/dsputil.c
27 * DSP utils
28 */
29
30 #include "avcodec.h"
31 #include "dsputil.h"
32 #include "simple_idct.h"
33 #include "faandct.h"
34 #include "faanidct.h"
35 #include "mathops.h"
36 #include "h263.h"
37 #include "snow.h"
38
39 /* snow.c */
40 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
41
42 /* vorbis.c */
43 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
44
45 /* ac3dec.c */
46 void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
47
48 /* flacenc.c */
49 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
50
51 /* pngdec.c */
52 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
53
54 /* eaidct.c */
55 void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
56
57 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
58 uint32_t ff_squareTbl[512] = {0, };
59
60 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
61 #define pb_7f (~0UL/255 * 0x7f)
62 #define pb_80 (~0UL/255 * 0x80)
63
64 const uint8_t ff_zigzag_direct[64] = {
65 0, 1, 8, 16, 9, 2, 3, 10,
66 17, 24, 32, 25, 18, 11, 4, 5,
67 12, 19, 26, 33, 40, 48, 41, 34,
68 27, 20, 13, 6, 7, 14, 21, 28,
69 35, 42, 49, 56, 57, 50, 43, 36,
70 29, 22, 15, 23, 30, 37, 44, 51,
71 58, 59, 52, 45, 38, 31, 39, 46,
72 53, 60, 61, 54, 47, 55, 62, 63
73 };
74
75 /* Specific zigzag scan for 248 idct. NOTE that unlike the
76 specification, we interleave the fields */
77 const uint8_t ff_zigzag248_direct[64] = {
78 0, 8, 1, 9, 16, 24, 2, 10,
79 17, 25, 32, 40, 48, 56, 33, 41,
80 18, 26, 3, 11, 4, 12, 19, 27,
81 34, 42, 49, 57, 50, 58, 35, 43,
82 20, 28, 5, 13, 6, 14, 21, 29,
83 36, 44, 51, 59, 52, 60, 37, 45,
84 22, 30, 7, 15, 23, 31, 38, 46,
85 53, 61, 54, 62, 39, 47, 55, 63,
86 };
87
88 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
89 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
90
91 const uint8_t ff_alternate_horizontal_scan[64] = {
92 0, 1, 2, 3, 8, 9, 16, 17,
93 10, 11, 4, 5, 6, 7, 15, 14,
94 13, 12, 19, 18, 24, 25, 32, 33,
95 26, 27, 20, 21, 22, 23, 28, 29,
96 30, 31, 34, 35, 40, 41, 48, 49,
97 42, 43, 36, 37, 38, 39, 44, 45,
98 46, 47, 50, 51, 56, 57, 58, 59,
99 52, 53, 54, 55, 60, 61, 62, 63,
100 };
101
102 const uint8_t ff_alternate_vertical_scan[64] = {
103 0, 8, 16, 24, 1, 9, 2, 10,
104 17, 25, 32, 40, 48, 56, 57, 49,
105 41, 33, 26, 18, 3, 11, 4, 12,
106 19, 27, 34, 42, 50, 58, 35, 43,
107 51, 59, 20, 28, 5, 13, 6, 14,
108 21, 29, 36, 44, 52, 60, 37, 45,
109 53, 61, 22, 30, 7, 15, 23, 31,
110 38, 46, 54, 62, 39, 47, 55, 63,
111 };
112
113 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
114 const uint32_t ff_inverse[256]={
115 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
116 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
117 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
118 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
119 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
120 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
121 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
122 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
123 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
124 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
125 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
126 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
127 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
128 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
129 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
130 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
131 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
132 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
133 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
134 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
135 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
136 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
137 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
138 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
139 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
140 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
141 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
142 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
143 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
144 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
145 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
146 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
147 };
148
149 /* Input permutation for the simple_idct_mmx */
150 static const uint8_t simple_mmx_permutation[64]={
151 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
152 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
153 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
154 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
155 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
156 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
157 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
158 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
159 };
160
161 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
162
163 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
164 int i;
165 int end;
166
167 st->scantable= src_scantable;
168
169 for(i=0; i<64; i++){
170 int j;
171 j = src_scantable[i];
172 st->permutated[i] = permutation[j];
173 #if ARCH_PPC
174 st->inverse[j] = i;
175 #endif
176 }
177
178 end=-1;
179 for(i=0; i<64; i++){
180 int j;
181 j = st->permutated[i];
182 if(j>end) end=j;
183 st->raster_end[i]= end;
184 }
185 }
186
/**
 * Sum all samples of a 16x16 block.
 * @param pix       top-left sample of the block
 * @param line_size distance in bytes between vertically adjacent rows
 * @return sum of the 256 samples
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
208
209 static int pix_norm1_c(uint8_t * pix, int line_size)
210 {
211 int s, i, j;
212 uint32_t *sq = ff_squareTbl + 256;
213
214 s = 0;
215 for (i = 0; i < 16; i++) {
216 for (j = 0; j < 16; j += 8) {
217 #if 0
218 s += sq[pix[0]];
219 s += sq[pix[1]];
220 s += sq[pix[2]];
221 s += sq[pix[3]];
222 s += sq[pix[4]];
223 s += sq[pix[5]];
224 s += sq[pix[6]];
225 s += sq[pix[7]];
226 #else
227 #if LONG_MAX > 2147483647
228 register uint64_t x=*(uint64_t*)pix;
229 s += sq[x&0xff];
230 s += sq[(x>>8)&0xff];
231 s += sq[(x>>16)&0xff];
232 s += sq[(x>>24)&0xff];
233 s += sq[(x>>32)&0xff];
234 s += sq[(x>>40)&0xff];
235 s += sq[(x>>48)&0xff];
236 s += sq[(x>>56)&0xff];
237 #else
238 register uint32_t x=*(uint32_t*)pix;
239 s += sq[x&0xff];
240 s += sq[(x>>8)&0xff];
241 s += sq[(x>>16)&0xff];
242 s += sq[(x>>24)&0xff];
243 x=*(uint32_t*)(pix+4);
244 s += sq[x&0xff];
245 s += sq[(x>>8)&0xff];
246 s += sq[(x>>16)&0xff];
247 s += sq[(x>>24)&0xff];
248 #endif
249 #endif
250 pix += 8;
251 }
252 pix += line_size - 16;
253 }
254 return s;
255 }
256
/**
 * Byte-swap w 32-bit words from src into dst.
 * Processes eight words per outer iteration, then the remainder
 * one word at a time. dst may equal src.
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i = 0;

    while (i + 8 <= w) {
        int k;
        for (k = 0; k < 8; k++)
            dst[i + k] = bswap_32(src[i + k]);
        i += 8;
    }
    for (; i < w; i++)
        dst[i] = bswap_32(src[i]);
}
274
275 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
276 {
277 int s, i;
278 uint32_t *sq = ff_squareTbl + 256;
279
280 s = 0;
281 for (i = 0; i < h; i++) {
282 s += sq[pix1[0] - pix2[0]];
283 s += sq[pix1[1] - pix2[1]];
284 s += sq[pix1[2] - pix2[2]];
285 s += sq[pix1[3] - pix2[3]];
286 pix1 += line_size;
287 pix2 += line_size;
288 }
289 return s;
290 }
291
292 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
293 {
294 int s, i;
295 uint32_t *sq = ff_squareTbl + 256;
296
297 s = 0;
298 for (i = 0; i < h; i++) {
299 s += sq[pix1[0] - pix2[0]];
300 s += sq[pix1[1] - pix2[1]];
301 s += sq[pix1[2] - pix2[2]];
302 s += sq[pix1[3] - pix2[3]];
303 s += sq[pix1[4] - pix2[4]];
304 s += sq[pix1[5] - pix2[5]];
305 s += sq[pix1[6] - pix2[6]];
306 s += sq[pix1[7] - pix2[7]];
307 pix1 += line_size;
308 pix2 += line_size;
309 }
310 return s;
311 }
312
313 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
314 {
315 int s, i;
316 uint32_t *sq = ff_squareTbl + 256;
317
318 s = 0;
319 for (i = 0; i < h; i++) {
320 s += sq[pix1[ 0] - pix2[ 0]];
321 s += sq[pix1[ 1] - pix2[ 1]];
322 s += sq[pix1[ 2] - pix2[ 2]];
323 s += sq[pix1[ 3] - pix2[ 3]];
324 s += sq[pix1[ 4] - pix2[ 4]];
325 s += sq[pix1[ 5] - pix2[ 5]];
326 s += sq[pix1[ 6] - pix2[ 6]];
327 s += sq[pix1[ 7] - pix2[ 7]];
328 s += sq[pix1[ 8] - pix2[ 8]];
329 s += sq[pix1[ 9] - pix2[ 9]];
330 s += sq[pix1[10] - pix2[10]];
331 s += sq[pix1[11] - pix2[11]];
332 s += sq[pix1[12] - pix2[12]];
333 s += sq[pix1[13] - pix2[13]];
334 s += sq[pix1[14] - pix2[14]];
335 s += sq[pix1[15] - pix2[15]];
336
337 pix1 += line_size;
338 pix2 += line_size;
339 }
340 return s;
341 }
342
343
344 #if CONFIG_SNOW_ENCODER //dwt is in snow.c
/**
 * Wavelet-domain distortion between two blocks (snow encoder metric).
 * Transforms the scaled difference pix1-pix2 with ff_spatial_dwt and
 * returns the per-subband-weighted sum of absolute transform
 * coefficients, scaled down by 2^9.
 * @param w    block width (8, 16 or 32); must equal h (asserted below)
 * @param type wavelet type passed to ff_spatial_dwt
 *             (indexes scale[]: 0 = 9/7 weights, 1 = 5/3 weights)
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;  /* decomposition levels: 3 for 8x8, else 4 */
    int tmp[32*32];                     /* transform buffer, fixed stride 32 */
    int level, ori;
    /* per-(type, dec_count, level, orientation) weighting factors */
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    /* residual, scaled up by 16 for fixed-point headroom in the DWT */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    /* accumulate |coefficient| * weight over every subband; at level 0
       only orientation 0 (LL band) exists, hence the "level ? 1 : 0" */
    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);       /* subband edge length */
            int sx= (ori&1) ? size : 0;           /* x offset of subband */
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;      /* y offset (in ints) */

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;  /* undo the <<4 input scaling (squared by weighting) plus weight scale */
}
413
/* Thin wrappers binding w_c to a fixed block width and wavelet type:
   w53_* use type 1 (5/3 wavelet), w97_* use type 0 (9/7 wavelet);
   the suffix gives the block width (8, 16 or 32). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
437 #endif
438
/* draw the edges of width 'w' of an image of size width, height
   by replicating the outermost rows/columns into the surrounding border */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom: replicate first/last row into the w rows outside */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right: replicate first/last column of every row */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}
467
468 /**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
470 * @param buf destination buffer
471 * @param src source buffer
472 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
473 * @param block_w width of block
474 * @param block_h height of block
475 * @param src_x x coordinate of the top left sample of the block in the source buffer
476 * @param src_y y coordinate of the top left sample of the block in the source buffer
477 * @param w width of the source buffer
478 * @param h height of the source buffer
479 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    /* If the block lies entirely outside the picture, move src so that the
       block touches the nearest picture edge; the replicated fill below
       then produces the correct edge-extended samples. */
    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    /* sub-rectangle of the block that overlaps the picture */
    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top: replicate the first valid row upwards
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom: replicate the last valid row downwards
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    /* finally extend every row sideways (runs after the vertical fill so
       the corners are filled correctly too) */
    for(y=0; y<block_h; y++){
        //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

        //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
538
539 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
540 {
541 int i;
542
543 /* read the pixels */
544 for(i=0;i<8;i++) {
545 block[0] = pixels[0];
546 block[1] = pixels[1];
547 block[2] = pixels[2];
548 block[3] = pixels[3];
549 block[4] = pixels[4];
550 block[5] = pixels[5];
551 block[6] = pixels[6];
552 block[7] = pixels[7];
553 pixels += line_size;
554 block += 8;
555 }
556 }
557
558 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
559 const uint8_t *s2, int stride){
560 int i;
561
562 /* read the pixels */
563 for(i=0;i<8;i++) {
564 block[0] = s1[0] - s2[0];
565 block[1] = s1[1] - s2[1];
566 block[2] = s1[2] - s2[2];
567 block[3] = s1[3] - s2[3];
568 block[4] = s1[4] - s2[4];
569 block[5] = s1[5] - s2[5];
570 block[6] = s1[6] - s2[6];
571 block[7] = s1[7] - s2[7];
572 s1 += stride;
573 s2 += stride;
574 block += 8;
575 }
576 }
577
578
579 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
580 int line_size)
581 {
582 int i;
583 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
584
585 /* read the pixels */
586 for(i=0;i<8;i++) {
587 pixels[0] = cm[block[0]];
588 pixels[1] = cm[block[1]];
589 pixels[2] = cm[block[2]];
590 pixels[3] = cm[block[3]];
591 pixels[4] = cm[block[4]];
592 pixels[5] = cm[block[5]];
593 pixels[6] = cm[block[6]];
594 pixels[7] = cm[block[7]];
595
596 pixels += line_size;
597 block += 8;
598 }
599 }
600
601 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
602 int line_size)
603 {
604 int i;
605 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
606
607 /* read the pixels */
608 for(i=0;i<4;i++) {
609 pixels[0] = cm[block[0]];
610 pixels[1] = cm[block[1]];
611 pixels[2] = cm[block[2]];
612 pixels[3] = cm[block[3]];
613
614 pixels += line_size;
615 block += 8;
616 }
617 }
618
619 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
620 int line_size)
621 {
622 int i;
623 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
624
625 /* read the pixels */
626 for(i=0;i<2;i++) {
627 pixels[0] = cm[block[0]];
628 pixels[1] = cm[block[1]];
629
630 pixels += line_size;
631 block += 8;
632 }
633 }
634
635 static void put_signed_pixels_clamped_c(const DCTELEM *block,
636 uint8_t *restrict pixels,
637 int line_size)
638 {
639 int i, j;
640
641 for (i = 0; i < 8; i++) {
642 for (j = 0; j < 8; j++) {
643 if (*block < -128)
644 *pixels = 0;
645 else if (*block > 127)
646 *pixels = 255;
647 else
648 *pixels = (uint8_t)(*block + 128);
649 block++;
650 pixels++;
651 }
652 pixels += (line_size - 8);
653 }
654 }
655
656 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
657 int line_size)
658 {
659 int i;
660 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
661
662 /* read the pixels */
663 for(i=0;i<8;i++) {
664 pixels[0] = cm[pixels[0] + block[0]];
665 pixels[1] = cm[pixels[1] + block[1]];
666 pixels[2] = cm[pixels[2] + block[2]];
667 pixels[3] = cm[pixels[3] + block[3]];
668 pixels[4] = cm[pixels[4] + block[4]];
669 pixels[5] = cm[pixels[5] + block[5]];
670 pixels[6] = cm[pixels[6] + block[6]];
671 pixels[7] = cm[pixels[7] + block[7]];
672 pixels += line_size;
673 block += 8;
674 }
675 }
676
677 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
678 int line_size)
679 {
680 int i;
681 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
682
683 /* read the pixels */
684 for(i=0;i<4;i++) {
685 pixels[0] = cm[pixels[0] + block[0]];
686 pixels[1] = cm[pixels[1] + block[1]];
687 pixels[2] = cm[pixels[2] + block[2]];
688 pixels[3] = cm[pixels[3] + block[3]];
689 pixels += line_size;
690 block += 8;
691 }
692 }
693
694 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
695 int line_size)
696 {
697 int i;
698 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
699
700 /* read the pixels */
701 for(i=0;i<2;i++) {
702 pixels[0] = cm[pixels[0] + block[0]];
703 pixels[1] = cm[pixels[1] + block[1]];
704 pixels += line_size;
705 block += 8;
706 }
707 }
708
709 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
710 {
711 int i;
712 for(i=0;i<8;i++) {
713 pixels[0] += block[0];
714 pixels[1] += block[1];
715 pixels[2] += block[2];
716 pixels[3] += block[3];
717 pixels[4] += block[4];
718 pixels[5] += block[5];
719 pixels[6] += block[6];
720 pixels[7] += block[7];
721 pixels += line_size;
722 block += 8;
723 }
724 }
725
726 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
727 {
728 int i;
729 for(i=0;i<4;i++) {
730 pixels[0] += block[0];
731 pixels[1] += block[1];
732 pixels[2] += block[2];
733 pixels[3] += block[3];
734 pixels += line_size;
735 block += 4;
736 }
737 }
738
739 static int sum_abs_dctelem_c(DCTELEM *block)
740 {
741 int sum=0, i;
742 for(i=0; i<64; i++)
743 sum+= FFABS(block[i]);
744 return sum;
745 }
746
747 #if 0
748
749 #define PIXOP2(OPNAME, OP) \
750 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
751 {\
752 int i;\
753 for(i=0; i<h; i++){\
754 OP(*((uint64_t*)block), AV_RN64(pixels));\
755 pixels+=line_size;\
756 block +=line_size;\
757 }\
758 }\
759 \
760 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
761 {\
762 int i;\
763 for(i=0; i<h; i++){\
764 const uint64_t a= AV_RN64(pixels );\
765 const uint64_t b= AV_RN64(pixels+1);\
766 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
767 pixels+=line_size;\
768 block +=line_size;\
769 }\
770 }\
771 \
772 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
773 {\
774 int i;\
775 for(i=0; i<h; i++){\
776 const uint64_t a= AV_RN64(pixels );\
777 const uint64_t b= AV_RN64(pixels+1);\
778 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
779 pixels+=line_size;\
780 block +=line_size;\
781 }\
782 }\
783 \
784 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
785 {\
786 int i;\
787 for(i=0; i<h; i++){\
788 const uint64_t a= AV_RN64(pixels );\
789 const uint64_t b= AV_RN64(pixels+line_size);\
790 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
791 pixels+=line_size;\
792 block +=line_size;\
793 }\
794 }\
795 \
796 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
797 {\
798 int i;\
799 for(i=0; i<h; i++){\
800 const uint64_t a= AV_RN64(pixels );\
801 const uint64_t b= AV_RN64(pixels+line_size);\
802 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
803 pixels+=line_size;\
804 block +=line_size;\
805 }\
806 }\
807 \
808 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
809 {\
810 int i;\
811 const uint64_t a= AV_RN64(pixels );\
812 const uint64_t b= AV_RN64(pixels+1);\
813 uint64_t l0= (a&0x0303030303030303ULL)\
814 + (b&0x0303030303030303ULL)\
815 + 0x0202020202020202ULL;\
816 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
817 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
818 uint64_t l1,h1;\
819 \
820 pixels+=line_size;\
821 for(i=0; i<h; i+=2){\
822 uint64_t a= AV_RN64(pixels );\
823 uint64_t b= AV_RN64(pixels+1);\
824 l1= (a&0x0303030303030303ULL)\
825 + (b&0x0303030303030303ULL);\
826 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
827 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
828 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
829 pixels+=line_size;\
830 block +=line_size;\
831 a= AV_RN64(pixels );\
832 b= AV_RN64(pixels+1);\
833 l0= (a&0x0303030303030303ULL)\
834 + (b&0x0303030303030303ULL)\
835 + 0x0202020202020202ULL;\
836 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
837 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
838 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
839 pixels+=line_size;\
840 block +=line_size;\
841 }\
842 }\
843 \
844 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
845 {\
846 int i;\
847 const uint64_t a= AV_RN64(pixels );\
848 const uint64_t b= AV_RN64(pixels+1);\
849 uint64_t l0= (a&0x0303030303030303ULL)\
850 + (b&0x0303030303030303ULL)\
851 + 0x0101010101010101ULL;\
852 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
853 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
854 uint64_t l1,h1;\
855 \
856 pixels+=line_size;\
857 for(i=0; i<h; i+=2){\
858 uint64_t a= AV_RN64(pixels );\
859 uint64_t b= AV_RN64(pixels+1);\
860 l1= (a&0x0303030303030303ULL)\
861 + (b&0x0303030303030303ULL);\
862 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
863 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
864 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
865 pixels+=line_size;\
866 block +=line_size;\
867 a= AV_RN64(pixels );\
868 b= AV_RN64(pixels+1);\
869 l0= (a&0x0303030303030303ULL)\
870 + (b&0x0303030303030303ULL)\
871 + 0x0101010101010101ULL;\
872 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
873 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
874 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
875 pixels+=line_size;\
876 block +=line_size;\
877 }\
878 }\
879 \
880 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
881 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
882 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
883 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
884 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
885 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
886 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
887
888 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
889 #else // 64 bit variant
890
891 #define PIXOP2(OPNAME, OP) \
892 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
893 int i;\
894 for(i=0; i<h; i++){\
895 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
896 pixels+=line_size;\
897 block +=line_size;\
898 }\
899 }\
900 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
901 int i;\
902 for(i=0; i<h; i++){\
903 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
904 pixels+=line_size;\
905 block +=line_size;\
906 }\
907 }\
908 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
909 int i;\
910 for(i=0; i<h; i++){\
911 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
912 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
913 pixels+=line_size;\
914 block +=line_size;\
915 }\
916 }\
917 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
918 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
919 }\
920 \
921 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
922 int src_stride1, int src_stride2, int h){\
923 int i;\
924 for(i=0; i<h; i++){\
925 uint32_t a,b;\
926 a= AV_RN32(&src1[i*src_stride1 ]);\
927 b= AV_RN32(&src2[i*src_stride2 ]);\
928 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
929 a= AV_RN32(&src1[i*src_stride1+4]);\
930 b= AV_RN32(&src2[i*src_stride2+4]);\
931 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
932 }\
933 }\
934 \
935 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
936 int src_stride1, int src_stride2, int h){\
937 int i;\
938 for(i=0; i<h; i++){\
939 uint32_t a,b;\
940 a= AV_RN32(&src1[i*src_stride1 ]);\
941 b= AV_RN32(&src2[i*src_stride2 ]);\
942 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
943 a= AV_RN32(&src1[i*src_stride1+4]);\
944 b= AV_RN32(&src2[i*src_stride2+4]);\
945 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
946 }\
947 }\
948 \
949 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
950 int src_stride1, int src_stride2, int h){\
951 int i;\
952 for(i=0; i<h; i++){\
953 uint32_t a,b;\
954 a= AV_RN32(&src1[i*src_stride1 ]);\
955 b= AV_RN32(&src2[i*src_stride2 ]);\
956 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
957 }\
958 }\
959 \
960 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
961 int src_stride1, int src_stride2, int h){\
962 int i;\
963 for(i=0; i<h; i++){\
964 uint32_t a,b;\
965 a= AV_RN16(&src1[i*src_stride1 ]);\
966 b= AV_RN16(&src2[i*src_stride2 ]);\
967 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
968 }\
969 }\
970 \
971 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
972 int src_stride1, int src_stride2, int h){\
973 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
974 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
975 }\
976 \
977 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
978 int src_stride1, int src_stride2, int h){\
979 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
980 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
981 }\
982 \
983 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
984 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
985 }\
986 \
987 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
988 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
989 }\
990 \
991 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
992 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
993 }\
994 \
995 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
996 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
997 }\
998 \
999 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1000 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1001 int i;\
1002 for(i=0; i<h; i++){\
1003 uint32_t a, b, c, d, l0, l1, h0, h1;\
1004 a= AV_RN32(&src1[i*src_stride1]);\
1005 b= AV_RN32(&src2[i*src_stride2]);\
1006 c= AV_RN32(&src3[i*src_stride3]);\
1007 d= AV_RN32(&src4[i*src_stride4]);\
1008 l0= (a&0x03030303UL)\
1009 + (b&0x03030303UL)\
1010 + 0x02020202UL;\
1011 h0= ((a&0xFCFCFCFCUL)>>2)\
1012 + ((b&0xFCFCFCFCUL)>>2);\
1013 l1= (c&0x03030303UL)\
1014 + (d&0x03030303UL);\
1015 h1= ((c&0xFCFCFCFCUL)>>2)\
1016 + ((d&0xFCFCFCFCUL)>>2);\
1017 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1018 a= AV_RN32(&src1[i*src_stride1+4]);\
1019 b= AV_RN32(&src2[i*src_stride2+4]);\
1020 c= AV_RN32(&src3[i*src_stride3+4]);\
1021 d= AV_RN32(&src4[i*src_stride4+4]);\
1022 l0= (a&0x03030303UL)\
1023 + (b&0x03030303UL)\
1024 + 0x02020202UL;\
1025 h0= ((a&0xFCFCFCFCUL)>>2)\
1026 + ((b&0xFCFCFCFCUL)>>2);\
1027 l1= (c&0x03030303UL)\
1028 + (d&0x03030303UL);\
1029 h1= ((c&0xFCFCFCFCUL)>>2)\
1030 + ((d&0xFCFCFCFCUL)>>2);\
1031 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1032 }\
1033 }\
1034 \
1035 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1036 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1037 }\
1038 \
1039 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1040 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1041 }\
1042 \
1043 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1044 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1045 }\
1046 \
1047 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1048 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1049 }\
1050 \
1051 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1052 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1053 int i;\
1054 for(i=0; i<h; i++){\
1055 uint32_t a, b, c, d, l0, l1, h0, h1;\
1056 a= AV_RN32(&src1[i*src_stride1]);\
1057 b= AV_RN32(&src2[i*src_stride2]);\
1058 c= AV_RN32(&src3[i*src_stride3]);\
1059 d= AV_RN32(&src4[i*src_stride4]);\
1060 l0= (a&0x03030303UL)\
1061 + (b&0x03030303UL)\
1062 + 0x01010101UL;\
1063 h0= ((a&0xFCFCFCFCUL)>>2)\
1064 + ((b&0xFCFCFCFCUL)>>2);\
1065 l1= (c&0x03030303UL)\
1066 + (d&0x03030303UL);\
1067 h1= ((c&0xFCFCFCFCUL)>>2)\
1068 + ((d&0xFCFCFCFCUL)>>2);\
1069 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1070 a= AV_RN32(&src1[i*src_stride1+4]);\
1071 b= AV_RN32(&src2[i*src_stride2+4]);\
1072 c= AV_RN32(&src3[i*src_stride3+4]);\
1073 d= AV_RN32(&src4[i*src_stride4+4]);\
1074 l0= (a&0x03030303UL)\
1075 + (b&0x03030303UL)\
1076 + 0x01010101UL;\
1077 h0= ((a&0xFCFCFCFCUL)>>2)\
1078 + ((b&0xFCFCFCFCUL)>>2);\
1079 l1= (c&0x03030303UL)\
1080 + (d&0x03030303UL);\
1081 h1= ((c&0xFCFCFCFCUL)>>2)\
1082 + ((d&0xFCFCFCFCUL)>>2);\
1083 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1084 }\
1085 }\
1086 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1087 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1088 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1089 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1090 }\
1091 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1092 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1093 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1094 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1095 }\
1096 \
1097 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1098 {\
1099 int i, a0, b0, a1, b1;\
1100 a0= pixels[0];\
1101 b0= pixels[1] + 2;\
1102 a0 += b0;\
1103 b0 += pixels[2];\
1104 \
1105 pixels+=line_size;\
1106 for(i=0; i<h; i+=2){\
1107 a1= pixels[0];\
1108 b1= pixels[1];\
1109 a1 += b1;\
1110 b1 += pixels[2];\
1111 \
1112 block[0]= (a1+a0)>>2; /* FIXME non put */\
1113 block[1]= (b1+b0)>>2;\
1114 \
1115 pixels+=line_size;\
1116 block +=line_size;\
1117 \
1118 a0= pixels[0];\
1119 b0= pixels[1] + 2;\
1120 a0 += b0;\
1121 b0 += pixels[2];\
1122 \
1123 block[0]= (a1+a0)>>2;\
1124 block[1]= (b1+b0)>>2;\
1125 pixels+=line_size;\
1126 block +=line_size;\
1127 }\
1128 }\
1129 \
1130 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1131 {\
1132 int i;\
1133 const uint32_t a= AV_RN32(pixels );\
1134 const uint32_t b= AV_RN32(pixels+1);\
1135 uint32_t l0= (a&0x03030303UL)\
1136 + (b&0x03030303UL)\
1137 + 0x02020202UL;\
1138 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1139 + ((b&0xFCFCFCFCUL)>>2);\
1140 uint32_t l1,h1;\
1141 \
1142 pixels+=line_size;\
1143 for(i=0; i<h; i+=2){\
1144 uint32_t a= AV_RN32(pixels );\
1145 uint32_t b= AV_RN32(pixels+1);\
1146 l1= (a&0x03030303UL)\
1147 + (b&0x03030303UL);\
1148 h1= ((a&0xFCFCFCFCUL)>>2)\
1149 + ((b&0xFCFCFCFCUL)>>2);\
1150 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1151 pixels+=line_size;\
1152 block +=line_size;\
1153 a= AV_RN32(pixels );\
1154 b= AV_RN32(pixels+1);\
1155 l0= (a&0x03030303UL)\
1156 + (b&0x03030303UL)\
1157 + 0x02020202UL;\
1158 h0= ((a&0xFCFCFCFCUL)>>2)\
1159 + ((b&0xFCFCFCFCUL)>>2);\
1160 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1161 pixels+=line_size;\
1162 block +=line_size;\
1163 }\
1164 }\
1165 \
1166 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1167 {\
1168 int j;\
1169 for(j=0; j<2; j++){\
1170 int i;\
1171 const uint32_t a= AV_RN32(pixels );\
1172 const uint32_t b= AV_RN32(pixels+1);\
1173 uint32_t l0= (a&0x03030303UL)\
1174 + (b&0x03030303UL)\
1175 + 0x02020202UL;\
1176 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1177 + ((b&0xFCFCFCFCUL)>>2);\
1178 uint32_t l1,h1;\
1179 \
1180 pixels+=line_size;\
1181 for(i=0; i<h; i+=2){\
1182 uint32_t a= AV_RN32(pixels );\
1183 uint32_t b= AV_RN32(pixels+1);\
1184 l1= (a&0x03030303UL)\
1185 + (b&0x03030303UL);\
1186 h1= ((a&0xFCFCFCFCUL)>>2)\
1187 + ((b&0xFCFCFCFCUL)>>2);\
1188 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1189 pixels+=line_size;\
1190 block +=line_size;\
1191 a= AV_RN32(pixels );\
1192 b= AV_RN32(pixels+1);\
1193 l0= (a&0x03030303UL)\
1194 + (b&0x03030303UL)\
1195 + 0x02020202UL;\
1196 h0= ((a&0xFCFCFCFCUL)>>2)\
1197 + ((b&0xFCFCFCFCUL)>>2);\
1198 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1199 pixels+=line_size;\
1200 block +=line_size;\
1201 }\
1202 pixels+=4-line_size*(h+1);\
1203 block +=4-line_size*h;\
1204 }\
1205 }\
1206 \
1207 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1208 {\
1209 int j;\
1210 for(j=0; j<2; j++){\
1211 int i;\
1212 const uint32_t a= AV_RN32(pixels );\
1213 const uint32_t b= AV_RN32(pixels+1);\
1214 uint32_t l0= (a&0x03030303UL)\
1215 + (b&0x03030303UL)\
1216 + 0x01010101UL;\
1217 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1218 + ((b&0xFCFCFCFCUL)>>2);\
1219 uint32_t l1,h1;\
1220 \
1221 pixels+=line_size;\
1222 for(i=0; i<h; i+=2){\
1223 uint32_t a= AV_RN32(pixels );\
1224 uint32_t b= AV_RN32(pixels+1);\
1225 l1= (a&0x03030303UL)\
1226 + (b&0x03030303UL);\
1227 h1= ((a&0xFCFCFCFCUL)>>2)\
1228 + ((b&0xFCFCFCFCUL)>>2);\
1229 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1230 pixels+=line_size;\
1231 block +=line_size;\
1232 a= AV_RN32(pixels );\
1233 b= AV_RN32(pixels+1);\
1234 l0= (a&0x03030303UL)\
1235 + (b&0x03030303UL)\
1236 + 0x01010101UL;\
1237 h0= ((a&0xFCFCFCFCUL)>>2)\
1238 + ((b&0xFCFCFCFCUL)>>2);\
1239 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1240 pixels+=line_size;\
1241 block +=line_size;\
1242 }\
1243 pixels+=4-line_size*(h+1);\
1244 block +=4-line_size*h;\
1245 }\
1246 }\
1247 \
1248 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1249 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1250 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1251 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1252 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1253 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1254 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1255 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1256
/* Store ops plugged into the PIXOP2 template above:
 * op_avg averages (with rounding) against the existing pixel, op_put
 * overwrites it. */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif /* NOTE(review): closes a conditional opened before this point,
        * presumably selecting the 32- vs 64-bit PIXOP2 variant -- confirm
        * against the full file. */
#define op_put(a, b) a = b

/* Instantiate the full set of avg_*/put_* pixel copy/average functions. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* Rounded 2- and 4-tap byte averages used by the MC helpers below. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1268
/* Non-rounding average of two 16-wide sources into dst, all sharing one
 * stride; thin adapter over the PIXOP2-generated put_no_rnd_pixels16_l2(). */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1272
/* Non-rounding average of two 8-wide sources into dst, all sharing one
 * stride; thin adapter over the PIXOP2-generated put_no_rnd_pixels8_l2(). */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
1276
/*
 * One-warp-point GMC (MPEG-4): bilinear 1/16-pel interpolation of an
 * 8-pixel-wide, h-row block.  x16/y16 are the fractional offsets (0..16);
 * the four corner weights always sum to 256, so the weighted sum plus the
 * caller-supplied rounder is normalized with >>8.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16-x16)*(16-y16);
    const int B = (   x16)*(16-y16);
    const int C = (16-x16)*(   y16);
    const int D = (   x16)*(   y16);

    for (int row = 0; row < h; row++) {
        for (int col = 0; col < 8; col++)
            dst[col] = (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1] + rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
1299
/**
 * Global motion compensation with an affine transform (MPEG-4 sprite
 * warping), C reference implementation.  Renders an 8-pixel-wide strip of
 * h rows: the source position starts at (ox,oy) and advances by (dxx,dyx)
 * per destination x step and (dxy,dyy) per destination row.  Positions are
 * fixed point with s = 1<<shift sub-pel steps per pixel; r is the bilinear
 * rounding constant.  Taps falling outside [0,width)x[0,height) are clamped
 * to the nearest edge, degrading the interpolation to 1-D or nearest-pixel.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;   /* sub-pel positions per full pixel */

    /* switch to last-valid-coordinate form so "(unsigned)v < width" below
     * means both bilinear taps (v and v+1) are inside the source */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* NOTE(review): the fractional part is taken after a fixed >>16
             * while the integer part uses >>shift -- presumably vx/vy are in
             * a (16+shift)-bit fixed-point format; confirm against callers. */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( ( src[index         ]*(s-frac_x)
                                         + src[index       +1]*   frac_x )*(s-frac_y)
                                       + ( src[index+stride  ]*(s-frac_x)
                                         + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp y, interpolate in x only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( ( src[index         ]*(s-frac_x)
                                         + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp x, interpolate in y only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( ( src[index         ]*(s-frac_y)
                                         + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* outside on both axes: nearest edge pixel */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
1357
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Full-pel case: plain copy, dispatched to the width-specialized
     * PIXOP2-generated routine.  Unsupported widths are silently ignored,
     * matching the original switch without a default. */
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1366
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Horizontal 1/3-pel interpolation: round((2*a + b)/3), computed as a
     * fixed-point multiply (683/2048 ~= 1/3). */
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++)
            dst[col] = (683*(2*src[col] + src[col+1] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}
1377
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Horizontal 2/3-pel interpolation: round((a + 2*b)/3) via the
     * 683/2048 fixed-point approximation of 1/3. */
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++)
            dst[col] = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}
1388
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Vertical 1/3-pel interpolation: round((2*top + bottom)/3) via the
     * 683/2048 fixed-point approximation of 1/3. */
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++)
            dst[col] = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}
1399
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Diagonal thirdpel interpolation: blend of the four neighbours with
     * weights 4:3:3:2 (sum 12), normalized via 2731/32768 ~= 1/12. */
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++)
            dst[col] = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}
1410
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Diagonal thirdpel interpolation: four-neighbour blend with weights
     * 3:2:4:3 (sum 12), normalized via 2731/32768 ~= 1/12. */
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++)
            dst[col] = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}
1421
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Vertical 2/3-pel interpolation: round((top + 2*bottom)/3) via the
     * 683/2048 fixed-point approximation of 1/3. */
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++)
            dst[col] = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
        dst += stride;
        src += stride;
    }
}
1432
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Diagonal thirdpel interpolation: four-neighbour blend with weights
     * 3:4:2:3 (sum 12), normalized via 2731/32768 ~= 1/12. */
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++)
            dst[col] = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}
1443
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Diagonal thirdpel interpolation: four-neighbour blend with weights
     * 2:3:3:4 (sum 12), normalized via 2731/32768 ~= 1/12. */
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++)
            dst[col] = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;
        dst += stride;
        src += stride;
    }
}
1454
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Full-pel case: average into dst, dispatched to the width-specialized
     * PIXOP2-generated routine.  Unsupported widths are silently ignored,
     * matching the original switch without a default. */
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1463
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Same 1/3-pel horizontal filter as put_tpel_pixels_mc10_c, with the
     * result averaged (rounding up) into the existing destination pixel. */
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(2*src[col] + src[col+1] + 1)) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1474
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Same 2/3-pel horizontal filter as put_tpel_pixels_mc20_c, with the
     * result averaged (rounding up) into the existing destination pixel. */
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(src[col] + 2*src[col+1] + 1)) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1485
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Same 1/3-pel vertical filter as put_tpel_pixels_mc01_c, with the
     * result averaged (rounding up) into the existing destination pixel. */
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(2*src[col] + src[col+stride] + 1)) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1496
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Same diagonal 4:3:3:2 thirdpel filter as put_tpel_pixels_mc11_c, with
     * the result averaged (rounding up) into the existing destination pixel. */
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1507
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Same diagonal 3:2:4:3 thirdpel filter as put_tpel_pixels_mc12_c, with
     * the result averaged (rounding up) into the existing destination pixel. */
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1518
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Same 2/3-pel vertical filter as put_tpel_pixels_mc02_c, with the
     * result averaged (rounding up) into the existing destination pixel. */
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++)
            dst[col] = (dst[col] + ((683*(src[col] + 2*src[col+stride] + 1)) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1529
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Same diagonal 3:4:2:3 thirdpel filter as put_tpel_pixels_mc21_c, with
     * the result averaged (rounding up) into the existing destination pixel. */
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
1540
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Same diagonal 2:3:3:4 thirdpel filter as put_tpel_pixels_mc22_c, with
     * the result averaged (rounding up) into the existing destination pixel. */
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++)
            dst[col] = (dst[col] + ((2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
/* Dead code: width-specialized tpel wrapper generator that was never
 * enabled.  NOTE(review): the bodies are not valid C as written (a stray
 * "void" before each call), so this could not simply be switched on. */
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1572
/*
 * Template generating the H.264 chroma motion-compensation functions for
 * block widths 2, 4 and 8.  (x,y) are the fractional offsets in eighths
 * (0..7); the four bilinear corner weights A..D always sum to 64, and OP
 * performs the final scaling/rounding and store (see op_put/op_avg at the
 * instantiation site).  When D==0 at least one offset is zero, so the
 * filter collapses to a cheaper 2-tap blend along a single axis; `step`
 * selects whether that axis is vertical (stride) or horizontal (1).
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
1673
/* Store ops for the H264_CHROMA_MC template: the weighted sum is scaled by
 * 64 (the weights sum to 64), so the put op stores (sum+32)>>6; the avg op
 * additionally averages with the existing destination pixel, rounding up. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1681
/*
 * 8-wide bilinear chroma interpolation in "no rounding" mode: the bias is
 * 32-4 = 28 instead of the usual 32 used by the rounded variants above.
 * (x,y) are the fractional offsets in eighths; the corner weights A..D sum
 * to 64, hence the >>6 normalization.  Unlike the H264_CHROMA_MC template,
 * both source rows are always read, even when a weight is zero.
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A = (8-x)*(8-y);
    const int B = (  x)*(8-y);
    const int C = (8-x)*(  y);
    const int D = (  x)*(  y);

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (int row = 0; row < h; row++) {
        for (int col = 0; col < 8; col++)
            dst[col] = (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1] + 32 - 4) >> 6;
        dst += stride;
        src += stride;
    }
}
1705
1706 #define QPEL_MC(r, OPNAME, RND, OP) \
1707 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1708 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1709 int i;\
1710 for(i=0; i<h; i++)\
1711 {\
1712 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1713 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1714 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1715 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1716 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1717 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1718 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1719 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1720 dst+=dstStride;\
1721 src+=srcStride;\
1722 }\
1723 }\
1724 \
1725 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1726 const int w=8;\
1727 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1728 int i;\
1729 for(i=0; i<w; i++)\
1730 {\
1731 const int src0= src[0*srcStride];\
1732 const int src1= src[1*srcStride];\
1733 const int src2= src[2*srcStride];\
1734 const int src3= src[3*srcStride];\
1735 const int src4= src[4*srcStride];\
1736 const int src5= src[5*srcStride];\
1737 const int src6= src[6*srcStride];\
1738 const int src7= src[7*srcStride];\
1739 const int src8= src[8*srcStride];\
1740 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1741 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1742 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1743 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1744 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1745 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1746 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1747 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1748 dst++;\
1749 src++;\
1750 }\
1751 }\
1752 \
1753 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1754 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1755 int i;\
1756 \
1757 for(i=0; i<h; i++)\
1758 {\
1759 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1760 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1761 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1762 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1763 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1764 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1765 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1766 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1767 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1768 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1769 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1770 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1771 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1772 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1773 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1774 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1775 dst+=dstStride;\
1776 src+=srcStride;\
1777 }\
1778 }\
1779 \
1780 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1781 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1782 int i;\
1783 const int w=16;\
1784 for(i=0; i<w; i++)\
1785 {\
1786 const int src0= src[0*srcStride];\
1787 const int src1= src[1*srcStride];\
1788 const int src2= src[2*srcStride];\
1789 const int src3= src[3*srcStride];\
1790 const int src4= src[4*srcStride];\
1791 const int src5= src[5*srcStride];\
1792 const int src6= src[6*srcStride];\
1793 const int src7= src[7*srcStride];\
1794 const int src8= src[8*srcStride];\
1795 const int src9= src[9*srcStride];\
1796 const int src10= src[10*srcStride];\
1797 const int src11= src[11*srcStride];\
1798 const int src12= src[12*srcStride];\
1799 const int src13= src[13*srcStride];\
1800 const int src14= src[14*srcStride];\
1801 const int src15= src[15*srcStride];\
1802 const int src16= src[16*srcStride];\
1803 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1804 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1805 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1806 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1807 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1808 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1809 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1810 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1811 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1812 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1813 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1814 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1815 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1816 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1817 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1818 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1819 dst++;\
1820 src++;\
1821 }\
1822 }\
1823 \
1824 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1825 OPNAME ## pixels8_c(dst, src, stride, 8);\
1826 }\
1827 \
1828 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1829 uint8_t half[64];\
1830 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1831 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1832 }\
1833 \
1834 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1835 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1836 }\
1837 \
1838 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1839 uint8_t half[64];\
1840 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1841 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1842 }\
1843 \
1844 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1845 uint8_t full[16*9];\
1846 uint8_t half[64];\
1847 copy_block9(full, src, 16, stride, 9);\
1848 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1849 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1850 }\
1851 \
1852 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1853 uint8_t full[16*9];\
1854 copy_block9(full, src, 16, stride, 9);\
1855 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1856 }\
1857 \
1858 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1859 uint8_t full[16*9];\
1860 uint8_t half[64];\
1861 copy_block9(full, src, 16, stride, 9);\
1862 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1863 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1864 }\
1865 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1866 uint8_t full[16*9];\
1867 uint8_t halfH[72];\
1868 uint8_t halfV[64];\
1869 uint8_t halfHV[64];\
1870 copy_block9(full, src, 16, stride, 9);\
1871 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1872 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1873 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1874 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1875 }\
1876 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1877 uint8_t full[16*9];\
1878 uint8_t halfH[72];\
1879 uint8_t halfHV[64];\
1880 copy_block9(full, src, 16, stride, 9);\
1881 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1882 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1883 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1884 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1885 }\
1886 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1887 uint8_t full[16*9];\
1888 uint8_t halfH[72];\
1889 uint8_t halfV[64];\
1890 uint8_t halfHV[64];\
1891 copy_block9(full, src, 16, stride, 9);\
1892 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1893 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1894 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1895 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1896 }\
1897 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1898 uint8_t full[16*9];\
1899 uint8_t halfH[72];\
1900 uint8_t halfHV[64];\
1901 copy_block9(full, src, 16, stride, 9);\
1902 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1903 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1904 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1905 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1906 }\
1907 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1908 uint8_t full[16*9];\
1909 uint8_t halfH[72];\
1910 uint8_t halfV[64];\
1911 uint8_t halfHV[64];\
1912 copy_block9(full, src, 16, stride, 9);\
1913 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1914 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1915 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1916 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1917 }\
1918 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1919 uint8_t full[16*9];\
1920 uint8_t halfH[72];\
1921 uint8_t halfHV[64];\
1922 copy_block9(full, src, 16, stride, 9);\
1923 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1924 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1925 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1926 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1927 }\
1928 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1929 uint8_t full[16*9];\
1930 uint8_t halfH[72];\
1931 uint8_t halfV[64];\
1932 uint8_t halfHV[64];\
1933 copy_block9(full, src, 16, stride, 9);\
1934 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1935 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1936 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1937 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1938 }\
1939 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1940 uint8_t full[16*9];\
1941 uint8_t halfH[72];\
1942 uint8_t halfHV[64];\
1943 copy_block9(full, src, 16, stride, 9);\
1944 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1945 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1946 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1947 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1948 }\
1949 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1950 uint8_t halfH[72];\
1951 uint8_t halfHV[64];\
1952 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1953 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1954 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1955 }\
1956 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1957 uint8_t halfH[72];\
1958 uint8_t halfHV[64];\
1959 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1960 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1961 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1962 }\
1963 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1964 uint8_t full[16*9];\
1965 uint8_t halfH[72];\
1966 uint8_t halfV[64];\
1967 uint8_t halfHV[64];\
1968 copy_block9(full, src, 16, stride, 9);\
1969 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1970 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1971 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1972 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1973 }\
1974 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1975 uint8_t full[16*9];\
1976 uint8_t halfH[72];\
1977 copy_block9(full, src, 16, stride, 9);\
1978 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1979 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1980 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1981 }\
1982 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1983 uint8_t full[16*9];\
1984 uint8_t halfH[72];\
1985 uint8_t halfV[64];\
1986 uint8_t halfHV[64];\
1987 copy_block9(full, src, 16, stride, 9);\
1988 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1989 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1990 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1991 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1992 }\
1993 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1994 uint8_t full[16*9];\
1995 uint8_t halfH[72];\
1996 copy_block9(full, src, 16, stride, 9);\
1997 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1998 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1999 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2000 }\
2001 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2002 uint8_t halfH[72];\
2003 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2004 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2005 }\
2006 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2007 OPNAME ## pixels16_c(dst, src, stride, 16);\
2008 }\
2009 \
2010 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2011 uint8_t half[256];\
2012 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2013 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2014 }\
2015 \
2016 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2017 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2018 }\
2019 \
2020 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2021 uint8_t half[256];\
2022 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2023 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2024 }\
2025 \
2026 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2027 uint8_t full[24*17];\
2028 uint8_t half[256];\
2029 copy_block17(full, src, 24, stride, 17);\
2030 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2031 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2032 }\
2033 \
2034 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2035 uint8_t full[24*17];\
2036 copy_block17(full, src, 24, stride, 17);\
2037 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2038 }\
2039 \
2040 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2041 uint8_t full[24*17];\
2042 uint8_t half[256];\
2043 copy_block17(full, src, 24, stride, 17);\
2044 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2045 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2046 }\
2047 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2048 uint8_t full[24*17];\
2049 uint8_t halfH[272];\
2050 uint8_t halfV[256];\
2051 uint8_t halfHV[256];\
2052 copy_block17(full, src, 24, stride, 17);\
2053 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2054 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2055 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2056 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2057 }\
2058 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2059 uint8_t full[24*17];\
2060 uint8_t halfH[272];\
2061 uint8_t halfHV[256];\
2062 copy_block17(full, src, 24, stride, 17);\
2063 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2064 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2065 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2066 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2067 }\
2068 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2069 uint8_t full[24*17];\
2070 uint8_t halfH[272];\
2071 uint8_t halfV[256];\
2072 uint8_t halfHV[256];\
2073 copy_block17(full, src, 24, stride, 17);\
2074 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2075 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2076 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2077 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2078 }\
2079 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2080 uint8_t full[24*17];\
2081 uint8_t halfH[272];\
2082 uint8_t halfHV[256];\
2083 copy_block17(full, src, 24, stride, 17);\
2084 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2085 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2086 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2087 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2088 }\
2089 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2090 uint8_t full[24*17];\
2091 uint8_t halfH[272];\
2092 uint8_t halfV[256];\
2093 uint8_t halfHV[256];\
2094 copy_block17(full, src, 24, stride, 17);\
2095 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2096 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2097 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2098 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2099 }\
2100 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2101 uint8_t full[24*17];\
2102 uint8_t halfH[272];\
2103 uint8_t halfHV[256];\
2104 copy_block17(full, src, 24, stride, 17);\
2105 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2106 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2107 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2108 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2109 }\
2110 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2111 uint8_t full[24*17];\
2112 uint8_t halfH[272];\
2113 uint8_t halfV[256];\
2114 uint8_t halfHV[256];\
2115 copy_block17(full, src, 24, stride, 17);\
2116 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2117 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2118 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2119 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2120 }\
2121 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2122 uint8_t full[24*17];\
2123 uint8_t halfH[272];\
2124 uint8_t halfHV[256];\
2125 copy_block17(full, src, 24, stride, 17);\
2126 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2127 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2128 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2129 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2130 }\
2131 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2132 uint8_t halfH[272];\
2133 uint8_t halfHV[256];\
2134 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2135 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2136 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2137 }\
2138 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2139 uint8_t halfH[272];\
2140 uint8_t halfHV[256];\
2141 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2142 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2143 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2144 }\
2145 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2146 uint8_t full[24*17];\
2147 uint8_t halfH[272];\
2148 uint8_t halfV[256];\
2149 uint8_t halfHV[256];\
2150 copy_block17(full, src, 24, stride, 17);\
2151 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2152 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2153 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2154 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2155 }\
2156 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2157 uint8_t full[24*17];\
2158 uint8_t halfH[272];\
2159 copy_block17(full, src, 24, stride, 17);\
2160 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2161 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2162 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2163 }\
2164 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2165 uint8_t full[24*17];\
2166 uint8_t halfH[272];\
2167 uint8_t halfV[256];\
2168 uint8_t halfHV[256];\
2169 copy_block17(full, src, 24, stride, 17);\
2170 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2171 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2172 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2173 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2174 }\
2175 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2176 uint8_t full[24*17];\
2177 uint8_t halfH[272];\
2178 copy_block17(full, src, 24, stride, 17);\
2179 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2180 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2181 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2182 }\
2183 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2184 uint8_t halfH[272];\
2185 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2186 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2187 }
2188
/* Store/average operations plugged into QPEL_MC below.
 * `b` is the raw 6-tap filter sum; cm[] (ff_cropTbl + MAX_NEG_CROP in the
 * generated code) clips the normalized value (b + 16) >> 5 (rounded) or
 * (b + 15) >> 5 (no-rounding variant) to the 0..255 pixel range.
 * op_avg additionally averages the result with the existing destination
 * pixel, with +1 rounding. */
2189 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2190 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2191 #define op_put(a, b) a = cm[((b) + 16)>>5]
2192 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2193
/* Instantiate the MPEG-4 qpel motion-compensation functions: rounded put,
 * no-rounding put and rounded avg. The avg_no_rnd variant is left
 * commented out (apparently unused — see the disabled line below). */
2194 QPEL_MC(0, put_       , _       , op_put)
2195 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2196 QPEL_MC(0, avg_       , _       , op_avg)
2197 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
/* The op_* helpers are scoped to the instantiations above only. */
2198 #undef op_avg
2199 #undef op_avg_no_rnd
2200 #undef op_put
2201 #undef op_put_no_rnd
2202
2203 #if 1
/*
 * H264_LOWPASS(OPNAME, OP, OP2) expands to the C reference
 * implementations of the H.264 six-tap half-pel interpolation filter
 * (taps 1, -5, 20, 20, -5, 1) for 2-, 4-, 8- and 16-pixel block sizes,
 * in horizontal (_h_), vertical (_v_) and combined (_hv_) forms.
 *
 *   OPNAME - name prefix for the generated functions (e.g. put_, avg_)
 *   OP     - store macro applied to the raw single-pass filter sum;
 *            expected to normalize and clip it via the `cm` crop table
 *            (defined at the instantiation site, not visible here)
 *   OP2    - store macro for the two-pass hv result, which has a larger
 *            dynamic range because tmp[] holds unscaled horizontal sums
 *
 * The _hv_ functions first run the horizontal filter over h+5 rows
 * (2 above and 2 below the block, hence the initial src -= 2*srcStride)
 * into the int16_t tmp[] buffer, then run the vertical filter over tmp
 * and store through OP2.
 *
 * NOTE(review): the seemingly unused `cm` locals are referenced by
 * OP/OP2 once the macro is expanded — do not remove them.
 */
2204 #define H264_LOWPASS(OPNAME, OP, OP2) \
/* 2-wide block: horizontal 6-tap filter, one row per iteration */ \
2205 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2206 const int h=2;\
2207 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2208 int i;\
2209 for(i=0; i<h; i++)\
2210 {\
2211 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2212 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2213 dst+=dstStride;\
2214 src+=srcStride;\
2215 }\
2216 }\
2217 \
/* 2-wide block: vertical 6-tap filter, one column per iteration */ \
2218 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2219 const int w=2;\
2220 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2221 int i;\
2222 for(i=0; i<w; i++)\
2223 {\
2224 const int srcB= src[-2*srcStride];\
2225 const int srcA= src[-1*srcStride];\
2226 const int src0= src[0 *srcStride];\
2227 const int src1= src[1 *srcStride];\
2228 const int src2= src[2 *srcStride];\
2229 const int src3= src[3 *srcStride];\
2230 const int src4= src[4 *srcStride];\
2231 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2232 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2233 dst++;\
2234 src++;\
2235 }\
2236 }\
2237 \
/* 2-wide block: horizontal pass into tmp[] at full precision, then vertical pass stored via OP2 */ \
2238 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2239 const int h=2;\
2240 const int w=2;\
2241 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2242 int i;\
2243 src -= 2*srcStride;\
2244 for(i=0; i<h+5; i++)\
2245 {\
2246 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2247 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2248 tmp+=tmpStride;\
2249 src+=srcStride;\
2250 }\
2251 tmp -= tmpStride*(h+5-2);\
2252 for(i=0; i<w; i++)\
2253 {\
2254 const int tmpB= tmp[-2*tmpStride];\
2255 const int tmpA= tmp[-1*tmpStride];\
2256 const int tmp0= tmp[0 *tmpStride];\
2257 const int tmp1= tmp[1 *tmpStride];\
2258 const int tmp2= tmp[2 *tmpStride];\
2259 const int tmp3= tmp[3 *tmpStride];\
2260 const int tmp4= tmp[4 *tmpStride];\
2261 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2262 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2263 dst++;\
2264 tmp++;\
2265 }\
2266 }\
/* 4-wide variants of the same three filters */ \
2267 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2268 const int h=4;\
2269 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2270 int i;\
2271 for(i=0; i<h; i++)\
2272 {\
2273 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2274 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2275 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2276 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2277 dst+=dstStride;\
2278 src+=srcStride;\
2279 }\
2280 }\
2281 \
2282 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2283 const int w=4;\
2284 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2285 int i;\
2286 for(i=0; i<w; i++)\
2287 {\
2288 const int srcB= src[-2*srcStride];\
2289 const int srcA= src[-1*srcStride];\
2290 const int src0= src[0 *srcStride];\
2291 const int src1= src[1 *srcStride];\
2292 const int src2= src[2 *srcStride];\
2293 const int src3= src[3 *srcStride];\
2294 const int src4= src[4 *srcStride];\
2295 const int src5= src[5 *srcStride];\
2296 const int src6= src[6 *srcStride];\
2297 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2298 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2299 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2300 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2301 dst++;\
2302 src++;\
2303 }\
2304 }\
2305 \
2306 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2307 const int h=4;\
2308 const int w=4;\
2309 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2310 int i;\
2311 src -= 2*srcStride;\
2312 for(i=0; i<h+5; i++)\
2313 {\
2314 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2315 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2316 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2317 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2318 tmp+=tmpStride;\
2319 src+=srcStride;\
2320 }\
2321 tmp -= tmpStride*(h+5-2);\
2322 for(i=0; i<w; i++)\
2323 {\
2324 const int tmpB= tmp[-2*tmpStride];\
2325 const int tmpA= tmp[-1*tmpStride];\
2326 const int tmp0= tmp[0 *tmpStride];\
2327 const int tmp1= tmp[1 *tmpStride];\
2328 const int tmp2= tmp[2 *tmpStride];\
2329 const int tmp3= tmp[3 *tmpStride];\
2330 const int tmp4= tmp[4 *tmpStride];\
2331 const int tmp5= tmp[5 *tmpStride];\
2332 const int tmp6= tmp[6 *tmpStride];\
2333 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2334 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2335 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2336 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2337 dst++;\
2338 tmp++;\
2339 }\
2340 }\
2341 \
/* 8-wide variants */ \
2342 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2343 const int h=8;\
2344 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2345 int i;\
2346 for(i=0; i<h; i++)\
2347 {\
2348 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2349 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2350 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2351 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2352 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2353 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2354 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2355 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2356 dst+=dstStride;\
2357 src+=srcStride;\
2358 }\
2359 }\
2360 \
2361 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2362 const int w=8;\
2363 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2364 int i;\
2365 for(i=0; i<w; i++)\
2366 {\
2367 const int srcB= src[-2*srcStride];\
2368 const int srcA= src[-1*srcStride];\
2369 const int src0= src[0 *srcStride];\
2370 const int src1= src[1 *srcStride];\
2371 const int src2= src[2 *srcStride];\
2372 const int src3= src[3 *srcStride];\
2373 const int src4= src[4 *srcStride];\
2374 const int src5= src[5 *srcStride];\
2375 const int src6= src[6 *srcStride];\
2376 const int src7= src[7 *srcStride];\
2377 const int src8= src[8 *srcStride];\
2378 const int src9= src[9 *srcStride];\
2379 const int src10=src[10*srcStride];\
2380 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2381 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2382 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2383 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2384 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2385 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2386 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2387 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2388 dst++;\
2389 src++;\
2390 }\
2391 }\
2392 \
2393 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2394 const int h=8;\
2395 const int w=8;\
2396 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2397 int i;\
2398 src -= 2*srcStride;\
2399 for(i=0; i<h+5; i++)\
2400 {\
2401 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2402 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2403 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2404 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2405 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2406 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2407 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2408 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2409 tmp+=tmpStride;\
2410 src+=srcStride;\
2411 }\
2412 tmp -= tmpStride*(h+5-2);\
2413 for(i=0; i<w; i++)\
2414 {\
2415 const int tmpB= tmp[-2*tmpStride];\
2416 const int tmpA= tmp[-1*tmpStride];\
2417 const int tmp0= tmp[0 *tmpStride];\
2418 const int tmp1= tmp[1 *tmpStride];\
2419 const int tmp2= tmp[2 *tmpStride];\
2420 const int tmp3= tmp[3 *tmpStride];\
2421 const int tmp4= tmp[4 *tmpStride];\
2422 const int tmp5= tmp[5 *tmpStride];\
2423 const int tmp6= tmp[6 *tmpStride];\
2424 const int tmp7= tmp[7 *tmpStride];\
2425 const int tmp8= tmp[8 *tmpStride];\
2426 const int tmp9= tmp[9 *tmpStride];\
2427 const int tmp10=tmp[10*tmpStride];\
2428 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2429 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2430 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2431 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2432 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2433 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2434 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2435 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2436 dst++;\
2437 tmp++;\
2438 }\
2439 }\
2440 \
/* 16-wide versions: each assembled from four 8-wide calls in a 2x2 tiling */ \
2441 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2442 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2443 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2444 src += 8*srcStride;\
2445 dst += 8*dstStride;\
2446 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2447 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2448 }\
2449 \
2450 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2451 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2452 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2453 src += 8*srcStride;\
2454 dst += 8*dstStride;\
2455 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2456 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2457 }\
2458 \
2459 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2460 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2461 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2462 src += 8*srcStride;\
2463 dst += 8*dstStride;\
2464 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2465 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2466 }\
2467
2468 #define H264_MC(OPNAME, SIZE) \
2469 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2470 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2471 }\
2472 \
2473 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2474 uint8_t half[SIZE*SIZE];\
2475 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2476 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2477 }\
2478 \
2479 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2480 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2481 }\
2482 \
2483 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2484 uint8_t half[SIZE*SIZE];\
2485 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2486 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2487 }\
2488 \
2489 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2490 uint8_t full[SIZE*(SIZE+5)];\
2491 uint8_t * const full_mid= full + SIZE*2;\
2492 uint8_t half[SIZE*SIZE];\
2493 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2494 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2495 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2496 }\
2497 \
2498 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2499 uint8_t full[SIZE*(SIZE+5)];\
2500 uint8_t * const full_mid= full + SIZE*2;\
2501 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2502 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2503 }\
2504 \
2505 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2506 uint8_t full[SIZE*(SIZE+5)];\
2507 uint8_t * const full_mid= full + SIZE*2;\
2508 uint8_t half[SIZE*SIZE];\
2509 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2510 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2511 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2512 }\
2513 \
2514 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2515 uint8_t full[SIZE*(SIZE+5)];\
2516 uint8_t * const full_mid= full + SIZE*2;\
2517 uint8_t halfH[SIZE*SIZE];\
2518 uint8_t halfV[SIZE*SIZE];\
2519 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2520 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2521 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2522 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2523 }\
2524 \
2525 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2526 uint8_t full[SIZE*(SIZE+5)];\
2527 uint8_t * const full_mid= full + SIZE*2;\
2528 uint8_t halfH[SIZE*SIZE];\
2529 uint8_t halfV[SIZE*SIZE];\
2530 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2531 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2532 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2533 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2534 }\
2535 \
2536 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2537 uint8_t full[SIZE*(SIZE+5)];\
2538 uint8_t * const full_mid= full + SIZE*2;\
2539 uint8_t halfH[SIZE*SIZE];\
2540 uint8_t halfV[SIZE*SIZE];\
2541 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2542 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2543 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2544 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2545 }\
2546 \
2547 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2548 uint8_t full[SIZE*(SIZE+5)];\
2549 uint8_t * const full_mid= full + SIZE*2;\
2550 uint8_t halfH[SIZE*SIZE];\
2551 uint8_t halfV[SIZE*SIZE];\
2552 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2553 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2554 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2555 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2556 }\
2557 \
2558 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2559 int16_t tmp[SIZE*(SIZE+5)];\
2560 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2561 }\
2562 \
2563 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2564 int16_t tmp[SIZE*(SIZE+5)];\
2565 uint8_t halfH[SIZE*SIZE];\
2566 uint8_t halfHV[SIZE*SIZE];\
2567 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2568 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2569 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2570 }\
2571 \
2572 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2573 int16_t tmp[SIZE*(SIZE+5)];\
2574 uint8_t halfH[SIZE*SIZE];\
2575 uint8_t halfHV[SIZE*SIZE];\
2576 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2577 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2578 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2579 }\
2580 \
2581 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2582 uint8_t full[SIZE*(SIZE+5)];\
2583 uint8_t * const full_mid= full + SIZE*2;\
2584 int16_t tmp[SIZE*(SIZE+5)];\
2585 uint8_t halfV[SIZE*SIZE];\
2586 uint8_t halfHV[SIZE*SIZE];\
2587 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2588 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2589 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2590 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2591 }\
2592 \
2593 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2594 uint8_t full[SIZE*(SIZE+5)];\
2595 uint8_t * const full_mid= full + SIZE*2;\
2596 int16_t tmp[SIZE*(SIZE+5)];\
2597 uint8_t halfV[SIZE*SIZE];\
2598 uint8_t halfHV[SIZE*SIZE];\
2599 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2600 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2601 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2602 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2603 }\
2604
/* Rounding/clipping operators plugged into H264_LOWPASS/H264_MC above:
 * op_put/op_avg round a one-dimensional filter result (+16, >>5),
 * op2_put/op2_avg round a two-dimensional (h+v) result (+512, >>10);
 * 'cm' maps the result into the valid sample range. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]

/* Instantiate the C reference H.264 qpel functions for the put and avg
 * variants; note 2x2 is only instantiated for put. */
H264_LOWPASS(put_ , op_put, op2_put)
H264_LOWPASS(avg_ , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif /* closes a conditional opened before this section (see above) */
2626
/* H.264 weighted prediction helpers.
 * op_scale1 weights a block in place (unidirectional weighting),
 * op_scale2 blends src into dst with two weights (bidirectional).
 * Results are rounded with 'offset' and shifted down by log2_denom. */
#define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/* One macro body serves every block width: the if(W==n) continue; chains
 * are compile-time constant conditions, so the unused tail is dead code. */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

/* instantiate all WxH weighting functions used by the H.264 decoder */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2696
2697 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2698 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2699 int i;
2700
2701 for(i=0; i<h; i++){
2702 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2703 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2704 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2705 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2706 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2707 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2708 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2709 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2710 dst+=dstStride;
2711 src+=srcStride;
2712 }
2713 }
2714
#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* Full-pel (mc00) CAVS qpel cases: these degenerate to plain block
 * copies/averages, so they delegate to the generic pixel helpers
 * defined earlier in this file. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
2732
#if CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/* Full-pel VC-1 mspel case: a plain 8x8 copy ('rnd' is unused here). */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */

/* DSPContext init entry points implemented in other files */
void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);

/* H264 specific */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_RV30_DECODER
void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV30_DECODER */
2750
#if CONFIG_RV40_DECODER
/* RV40 (3/4, 3/4) qpel cases: mapped to the plain xy2 half-pel
 * averaging helpers defined earlier in this file. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}

void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV40_DECODER */
2767
2768 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2769 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2770 int i;
2771
2772 for(i=0; i<w; i++){
2773 const int src_1= src[ -srcStride];
2774 const int src0 = src[0 ];
2775 const int src1 = src[ srcStride];
2776 const int src2 = src[2*srcStride];
2777 const int src3 = src[3*srcStride];
2778 const int src4 = src[4*srcStride];
2779 const int src5 = src[5*srcStride];
2780 const int src6 = src[6*srcStride];
2781 const int src7 = src[7*srcStride];
2782 const int src8 = src[8*srcStride];
2783 const int src9 = src[9*srcStride];
2784 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2785 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2786 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2787 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2788 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2789 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2790 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2791 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2792 src++;
2793 dst++;
2794 }
2795 }
2796
/* WMV2 MSPEL motion compensation.  mcXY: X/Y are the horizontal and
 * vertical sub-pel phases.  Phase 2 uses the lowpass-filtered half-pel
 * plane directly; phases 1 and 3 average the filtered plane with the
 * nearest full-pel samples (left/right or computed from filtered rows). */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    /* average of the source and its horizontally filtered half-pel plane */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    /* same as mc10, but averaged with the pixel one to the right */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];  /* 11 horizontally filtered rows (one above, two below) */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    /* halfH+8 skips the extra top row so the vertical filter is aligned with src */
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    /* like mc12 but the vertical-only plane is taken one pixel to the right */
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2844
/* H.263 in-loop deblocking across a horizontal edge: for each of 8
 * columns, corrects the two pixels on either side of the edge
 * (p0..p3 = rows -2..+1 relative to src) with a strength looked up
 * from the quantizer.  Compiled to a no-op when no H.263-based codec
 * is enabled. */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_ANY_H263) {
        int x;
        const int strength= ff_h263_loop_filter_strength[qscale];

        for(x=0; x<8; x++){
            int d1, d2, ad1;
            int p0= src[x-2*stride];
            int p1= src[x-1*stride];
            int p2= src[x+0*stride];
            int p3= src[x+1*stride];
            int d = (p0 - p3 + 4*(p2 - p1)) / 8;

            /* piecewise-linear response: full correction for small |d|,
             * ramping down to zero once |d| reaches 2*strength */
            if (d<-2*strength) d1= 0;
            else if(d<- strength) d1=-2*strength - d;
            else if(d< strength) d1= d;
            else if(d< 2*strength) d1= 2*strength - d;
            else d1= 0;

            p1 += d1;
            p2 -= d1;
            /* clamp to 0..255: bit 8 set means over/underflow;
             * ~(v>>31) yields 255 for positive overflow, 0 for negative */
            if(p1&256) p1= ~(p1>>31);
            if(p2&256) p2= ~(p2>>31);

            src[x-1*stride] = p1;
            src[x+0*stride] = p2;

            ad1= FFABS(d1)>>1;

            /* weaker secondary correction of the outer pixel pair,
             * clipped to half the primary correction */
            d2= av_clip((p0-p3)/4, -ad1, ad1);

            src[x-2*stride] = p0 - d2;
            src[x+ stride] = p3 + d2;
        }
    }
}
2881
/* H.263 in-loop deblocking across a vertical edge: same algorithm as
 * h263_v_loop_filter_c, but p0..p3 are the columns -2..+1 relative to
 * src for each of 8 rows. */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_ANY_H263) {
        int y;
        const int strength= ff_h263_loop_filter_strength[qscale];

        for(y=0; y<8; y++){
            int d1, d2, ad1;
            int p0= src[y*stride-2];
            int p1= src[y*stride-1];
            int p2= src[y*stride+0];
            int p3= src[y*stride+1];
            int d = (p0 - p3 + 4*(p2 - p1)) / 8;

            /* piecewise-linear response: full correction for small |d|,
             * ramping down to zero once |d| reaches 2*strength */
            if (d<-2*strength) d1= 0;
            else if(d<- strength) d1=-2*strength - d;
            else if(d< strength) d1= d;
            else if(d< 2*strength) d1= 2*strength - d;
            else d1= 0;

            p1 += d1;
            p2 -= d1;
            /* clamp to 0..255: bit 8 set means over/underflow;
             * ~(v>>31) yields 255 for positive overflow, 0 for negative */
            if(p1&256) p1= ~(p1>>31);
            if(p2&256) p2= ~(p2>>31);

            src[y*stride-1] = p1;
            src[y*stride+0] = p2;

            ad1= FFABS(d1)>>1;

            /* weaker secondary correction of the outer pixel pair,
             * clipped to half the primary correction */
            d2= av_clip((p0-p3)/4, -ad1, ad1);

            src[y*stride-2] = p0 - d2;
            src[y*stride+1] = p3 + d2;
        }
    }
}
2918
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block,
 * done as a vertical pass into temp[] (values scaled by 4) followed by
 * a horizontal pass with rounding back into src.  Border rows/columns
 * are passed through unfiltered (only renormalized). */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int temp[64];
    int row, col;

    /* vertical (1,2,1) pass; top and bottom rows are copied scaled by 4
     * so the final renormalization treats them uniformly */
    for (col = 0; col < 8; col++) {
        temp[col      ] = 4*src[col];
        temp[col + 7*8] = 4*src[col + 7*stride];
    }
    for (row = 1; row < 7; row++) {
        for (col = 0; col < 8; col++) {
            const int s = row*stride + col;
            temp[row*8 + col] = src[s - stride] + 2*src[s] + src[s + stride];
        }
    }

    /* horizontal (1,2,1) pass with rounding; edge columns are only
     * scaled back down */
    for (row = 0; row < 8; row++) {
        src[    row*stride] = (temp[    row*8] + 2) >> 2;
        src[7 + row*stride] = (temp[7 + row*8] + 2) >> 2;
        for (col = 1; col < 7; col++) {
            const int t = row*8 + col;
            src[row*stride + col] = (temp[t-1] + 2*temp[t] + temp[t+1] + 8) >> 4;
        }
    }
}
2945
/* H.264 luma deblocking, normal (tc-clipped) mode.  Filters one
 * 16-sample edge split into four groups of 4 samples; tc0[i] < 0
 * disables filtering for group i.  xstride steps across the edge and
 * ystride steps along it, so the same code handles both orientations
 * via the two wrappers below. */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            /* this 4-sample group is not filtered */
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            /* alpha/beta thresholds decide whether this looks like a
             * blocking artifact rather than a real edge */
            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* smooth inner samples: also correct p1/q1 and widen
                 * the clipping range for the p0/q0 delta */
                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* edge where samples across it are 'stride' apart */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/* edge where samples across it are adjacent in memory */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2994
/* H.264 luma deblocking, strong/intra mode: filters all 16 sample
 * positions along the edge.  Where the edge is very flat a strong
 * filter rewrites p0..p2 / q0..q2; otherwise only p0/q0 are replaced
 * by a 3-tap average. */
static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 16; d++ ) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];

        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            /* small step across the edge: eligible for the strong filter */
            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
                if( FFABS( p2 - p0 ) < beta)
                {
                    const int p3 = pix[-4*xstride];
                    /* p0', p1', p2' */
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                } else {
                    /* p0' */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                }
                if( FFABS( q2 - q0 ) < beta)
                {
                    const int q3 = pix[3*xstride];
                    /* q0', q1', q2' */
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                } else {
                    /* q0' */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                }
            }else{
                /* p0', q0' */
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
/* edge where samples across it are 'stride' apart */
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
}
/* edge where samples across it are adjacent in memory */
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
}
3051
/* H.264 chroma deblocking, normal (tc-clipped) mode: four groups of
 * two sample positions; only p0/q0 are corrected, by a delta clipped
 * to tc.  tc0[i] <= 0 disables group i. */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            /* this 2-sample group is not filtered */
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* edge where samples across it are 'stride' apart */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/* edge where samples across it are adjacent in memory */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
3088
/* H.264 chroma deblocking, strong/intra mode: replaces p0/q0 with a
 * 3-tap average at each of the 8 sample positions along the edge,
 * wherever the alpha/beta thresholds pass. */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int pos;
    for (pos = 0; pos < 8; pos++, pix += ystride) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];

        /* skip positions that look like a real edge */
        if (FFABS(p0 - q0) >= alpha ||
            FFABS(p1 - p0) >= beta  ||
            FFABS(q1 - q0) >= beta)
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/* edge where samples across it are 'stride' apart */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/* edge where samples across it are adjacent in memory */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
3116
/* Sum of absolute differences (SAD) between two 16-pixel-wide blocks
 * over h rows.  The unused first parameter matches the me_cmp_func
 * signature. */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3144
/* SAD of a 16-wide block against the horizontal half-pel interpolation
 * of pix2 (rounded average of each pixel and its right neighbour). */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3172
/* SAD of a 16-wide block against the vertical half-pel interpolation
 * of pix2 (rounded average of each pixel and the one below it). */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3202
/* SAD of a 16-wide block against the diagonal half-pel interpolation
 * of pix2 (rounded average of a 2x2 pixel neighbourhood). */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3232
/* Sum of absolute differences (SAD) between two 8-pixel-wide blocks
 * over h rows.  The unused first parameter matches the me_cmp_func
 * signature. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3252
/* SAD of an 8-wide block against the horizontal half-pel interpolation
 * of pix2 (rounded average of each pixel and its right neighbour). */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3272
/* SAD of an 8-wide block against the vertical half-pel interpolation
 * of pix2 (rounded average of each pixel and the one below it). */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3294
/* SAD of an 8-wide block against the diagonal half-pel interpolation
 * of pix2 (rounded average of a 2x2 pixel neighbourhood). */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3316
3317 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3318 MpegEncContext *c = v;
3319 int score1=0;
3320 int score2=0;
3321 int x,y;
3322
3323 for(y=0; y<h; y++){
3324 for(x=0; x<16; x++){
3325 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3326 }
3327 if(y+1<h){
3328 for(x=0; x<15; x++){
3329 score2+= FFABS( s1[x ] - s1[x +stride]
3330 - s1[x+1] + s1[x+1+stride])
3331 -FFABS( s2[x ] - s2[x +stride]
3332 - s2[x+1] + s2[x+1+stride]);
3333 }
3334 }
3335 s1+= stride;
3336 s2+= stride;
3337 }
3338
3339 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3340 else return score1 + FFABS(score2)*8;
3341 }
3342
3343 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3344 MpegEncContext *c = v;
3345 int score1=0;
3346 int score2=0;
3347 int x,y;
3348
3349 for(y=0; y<h; y++){
3350 for(x=0; x<8; x++){
3351 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3352 }
3353 if(y+1<h){
3354 for(x=0; x<7; x++){
3355 score2+= FFABS( s1[x ] - s1[x +stride]
3356 - s1[x+1] + s1[x+1+stride])
3357 -FFABS( s2[x ] - s2[x +stride]
3358 - s2[x+1] + s2[x+1+stride]);
3359 }
3360 }
3361 s1+= stride;
3362 s2+= stride;
3363 }
3364
3365 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3366 else return score1 + FFABS(score2)*8;
3367 }
3368
3369 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3370 int i;
3371 unsigned int sum=0;
3372
3373 for(i=0; i<8*8; i++){
3374 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3375 int w= weight[i];
3376 b>>= RECON_SHIFT;
3377 assert(-512<b && b<512);
3378
3379 sum += (w*b)*(w*b)>>4;
3380 }
3381 return sum>>2;
3382 }
3383
3384 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3385 int i;
3386
3387 for(i=0; i<8*8; i++){
3388 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3389 }
3390 }
3391
3392 /**
3393 * permutes an 8x8 block.
3394 * @param block the block which will be permuted according to the given permutation vector
3395 * @param permutation the permutation vector
3396 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3397 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3398 * (inverse) permutated to scantable order!
3399 */
3400 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3401 {
3402 int i;
3403 DCTELEM temp[64];
3404
3405 if(last<=0) return;
3406 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3407
3408 for(i=0; i<=last; i++){
3409 const int j= scantable[i];
3410 temp[j]= block[j];
3411 block[j]=0;
3412 }
3413
3414 for(i=0; i<=last; i++){
3415 const int j= scantable[i];
3416 const int perm_j= permutation[j];
3417 block[perm_j]= temp[j];
3418 }
3419 }
3420
/* Comparison function for FF_CMP_ZERO: always reports a score of zero,
 * effectively disabling the comparison metric. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
3424
3425 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3426 int i;
3427
3428 memset(cmp, 0, sizeof(void*)*5);
3429
3430 for(i=0; i<5; i++){
3431 switch(type&0xFF){
3432 case FF_CMP_SAD:
3433 cmp[i]= c->sad[i];
3434 break;
3435 case FF_CMP_SATD:
3436 cmp[i]= c->hadamard8_diff[i];
3437 break;
3438 case FF_CMP_SSE:
3439 cmp[i]= c->sse[i];
3440 break;
3441 case FF_CMP_DCT:
3442 cmp[i]= c->dct_sad[i];
3443 break;
3444 case FF_CMP_DCT264:
3445 cmp[i]= c->dct264_sad[i];
3446 break;
3447 case FF_CMP_DCTMAX:
3448 cmp[i]= c->dct_max[i];
3449 break;
3450 case FF_CMP_PSNR:
3451 cmp[i]= c->quant_psnr[i];
3452 break;
3453 case FF_CMP_BIT:
3454 cmp[i]= c->bit[i];
3455 break;
3456 case FF_CMP_RD:
3457 cmp[i]= c->rd[i];
3458 break;
3459 case FF_CMP_VSAD:
3460 cmp[i]= c->vsad[i];
3461 break;
3462 case FF_CMP_VSSE:
3463 cmp[i]= c->vsse[i];
3464 break;
3465 case FF_CMP_ZERO:
3466 cmp[i]= zero_cmp;
3467 break;
3468 case FF_CMP_NSSE:
3469 cmp[i]= c->nsse[i];
3470 break;
3471 #if CONFIG_SNOW_ENCODER
3472 case FF_CMP_W53:
3473 cmp[i]= c->w53[i];
3474 break;
3475 case FF_CMP_W97:
3476 cmp[i]= c->w97[i];