/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/dsputil.c
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "mathops.h"
#include "snow.h"
#include "mpegvideo.h"
#include "config.h"
#include "lpc.h"
#include "ac3dec.h"
#include "vorbis.h"
#include "png.h"

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f, depending on the CPU's native word size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
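
#if 0
/* Hedged, illustration-only sketch (not used anywhere below): on a 32-bit
 * unsigned long, ~0UL/255 == 0x01010101, so pb_7f == 0x7f7f7f7f and
 * pb_80 == 0x80808080; on 64-bit targets the pattern widens to 8 bytes. */
static void pb_constants_example(void)
{
    unsigned long ones = ~0UL / 255;    /* 0x01 replicated into every byte */
    assert(pb_7f == ones * 0x7f);
    assert(pb_80 == ones * 0x80);
}
#endif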

const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* non-permuted inverse zigzag_direct + 1 for the MMX quantizer */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];

const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256;
 * for a>16909558 the result is an overestimate by less than 1 part in 1<<24 */
const uint32_t ff_inverse[257]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
  16777216
};
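
#if 0
/* Hedged, illustration-only check of the identity documented above:
 * (1000ULL * ff_inverse[3]) >> 32 == (1000ULL * 1431655766) >> 32 == 333,
 * which equals 1000/3. Not used anywhere in this file. */
static unsigned fast_div_example(unsigned a, unsigned b)
{
    return (unsigned)(((uint64_t)a * ff_inverse[b]) >> 32); /* == a/b in range */
}
#endif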

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};

void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#if ARCH_PPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
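
#if 0
/* Hedged, illustration-only sketch: with the identity permutation,
 * permutated[] equals the scan order itself and raster_end[i] tracks the
 * highest permuted position reached after i+1 coefficients. */
static void scantable_example(void)
{
    uint8_t identity[64];
    ScanTable st;
    int i;
    for (i = 0; i < 64; i++)
        identity[i] = i;
    ff_init_scantable(identity, &st, ff_zigzag_direct);
    assert(st.permutated[2] == 8); /* third coefficient in zigzag order */
}
#endif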

static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
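
#if 0
/* Hedged, illustration-only note on the sq pointer used by the sse*_c
 * functions above: ff_squareTbl is initialized so that
 * (ff_squareTbl + 256)[d] == d*d for d in [-255, 255], letting the raw
 * byte difference index the table directly, with no FFABS needed. */
static int square_example(int d)
{
    const uint32_t *sq = ff_squareTbl + 256;
    return sq[d];   /* == d*d for -255 <= d <= 255 */
}
#endif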

#if CONFIG_SNOW_ENCODER //dwt is in snow.c
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}
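
/* Hedged note on w_c() above: it computes a Snow encoder distortion metric,
 * a weighted sum of absolute wavelet coefficients of the pix1-pix2
 * difference. scale[][][][] holds per-subband weights (9/7 vs 5/3 wavelet,
 * 3 vs 4 decomposition levels, indexed by level and orientation), and the
 * final >>9 rescales the <<4 input scaling and the roughly 8-bit weight
 * magnitudes back into a SAD-comparable range. */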

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif

/* draw the edges of width 'w' of an image of size width x height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}

/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                         int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
        //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

        //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
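
#if 0
/* Hedged usage sketch (names are illustrative, not a real caller): a
 * motion-compensation routine would redirect reads through a scratch
 * buffer whenever the requested block sticks out of the picture: */
static uint8_t *fetch_block(uint8_t *src, uint8_t *scratch, int linesize,
                            int bw, int bh, int sx, int sy, int w, int h)
{
    uint8_t *ptr = src + sy * linesize + sx;
    if (sx < 0 || sy < 0 || sx + bw > w || sy + bh > h) {
        ff_emulated_edge_mc(scratch, ptr, linesize, bw, bh, sx, sy, w, h);
        return scratch;
    }
    return ptr;
}
#endif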

static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}


static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}
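
/* Note: cm points MAX_NEG_CROP bytes into ff_cropTbl, so cm[v] clamps v to
 * [0,255] without a branch for any v in [-MAX_NEG_CROP, 255 + MAX_NEG_CROP];
 * the same idiom recurs in the clamped copy/add routines below. */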

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = block[0];
        pixels[1] = block[1];
        pixels[2] = block[2];
        pixels[3] = block[3];
        pixels[4] = block[4];
        pixels[5] = block[5];
        pixels[6] = block[6];
        pixels[7] = block[7];

        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}

static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}

static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int i, j;
    uint16_t *dst1 = (uint16_t *) dst;
    uint16_t *dst2 = (uint16_t *)(dst + linesize);

    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            dst1[i] = dst2[i] = src[i] * 0x0101;
        }
        src  += 8;
        dst1 += linesize;
        dst2 += linesize;
    }
}
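
/* Note: scale_block_c() upscales an 8x8 block to 16x16 by pixel doubling;
 * multiplying a byte by 0x0101 replicates it into both halves of a uint16_t
 * (e.g. 0xAB -> 0xABAB, i.e. two horizontal pixels), and each row is written
 * to two destination lines via dst1/dst2. */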

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= AV_RN64(pixels  );\
    const uint64_t b= AV_RN64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= AV_RN64(pixels  );\
        uint64_t b= AV_RN64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= AV_RN64(pixels  );\
        b= AV_RN64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= AV_RN64(pixels  );\
    const uint64_t b= AV_RN64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= AV_RN64(pixels  );\
        uint64_t b= AV_RN64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= AV_RN64(pixels  );\
        b= AV_RN64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i, a0, b0, a1, b1;\
    a0= pixels[0];\
    b0= pixels[1] + 2;\
    a0 += b0;\
    b0 += pixels[2];\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        a1= pixels[0];\
        b1= pixels[1];\
        a1 += b1;\
        b1 += pixels[2];\
\
        block[0]= (a1+a0)>>2; /* FIXME non put */\
        block[1]= (b1+b0)>>2;\
\
        pixels+=line_size;\
        block +=line_size;\
\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        block[0]= (a1+a0)>>2;\
        block[1]= (b1+b0)>>2;\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint32_t a= AV_RN32(pixels  );\
    const uint32_t b= AV_RN32(pixels+1);\
    uint32_t l0=  (a&0x03030303UL)\
                + (b&0x03030303UL)\
                + 0x02020202UL;\
    uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
               + ((b&0xFCFCFCFCUL)>>2);\
    uint32_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint32_t a= AV_RN32(pixels  );\
        uint32_t b= AV_RN32(pixels+1);\
        l1=  (a&0x03030303UL)\
           + (b&0x03030303UL);\
        h1= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
        a= AV_RN32(pixels  );\
        b= AV_RN32(pixels+1);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
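
/* Note on the averaging ops above: rnd_avg32() is a SWAR per-byte rounded
 * average, (a|b) - (((a^b) & 0xfefefefe) >> 1), and no_rnd_avg32() is the
 * truncating form (a&b) + (((a^b) & 0xfefefefe) >> 1); e.g. bytes 3 and 4
 * average to 4 with rounding and to 3 without. */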

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
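
/* Note: gmc1_c() is 1/16-pel bilinear interpolation; the four weights
 * satisfy A+B+C+D == 256, so the >>8 renormalizes and 'rounder' supplies
 * the rounding bias (e.g. 128 for round-to-nearest). */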

void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
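
/* Note on the constants in the tpel routines: 683 == (2^11 + 1)/3 and
 * 2731 == (2^15 + 4)/12, so e.g. (683*(2*src[j] + src[j+1] + 1)) >> 11
 * approximates the rounded third-pel average (2*a + b + 1)/3 without a
 * division. */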

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
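
/* Note: in the H.264 chroma MC above the bilinear weights satisfy
 * A+B+C+D == 64, so op_put's (b + 32) >> 6 is a round-to-nearest
 * renormalization and op_avg then averages against the existing
 * prediction with its own +1 rounding bias. */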

static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}

static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst+= stride;
        src+= stride;
    }
}
1775
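/* MPEG-4 quarter-pel MC. The h/v lowpass filters below implement the 8-tap
 * half-pel filter (-1, 3, -6, 20, 20, -6, 3, -1) with gain 32, mirroring
 * samples at the block edges instead of reading past them; the _no_rnd_
 * instantiation provides MPEG-4's rounding control. In the mcXY functions,
 * X and Y are the horizontal/vertical quarter-pel phases (0..3), with the
 * odd phases produced by blending against a neighbouring plane through
 * pixels*_l2(). The exported ff_*_old_c variants appear to keep an earlier
 * scheme that averages four planes via pixels*_l4(). */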
1776 #define QPEL_MC(r, OPNAME, RND, OP) \
1777 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1778 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1779 int i;\
1780 for(i=0; i<h; i++)\
1781 {\
1782 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1783 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1784 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1785 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1786 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1787 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1788 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1789 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1790 dst+=dstStride;\
1791 src+=srcStride;\
1792 }\
1793 }\
1794 \
1795 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1796 const int w=8;\
1797 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1798 int i;\
1799 for(i=0; i<w; i++)\
1800 {\
1801 const int src0= src[0*srcStride];\
1802 const int src1= src[1*srcStride];\
1803 const int src2= src[2*srcStride];\
1804 const int src3= src[3*srcStride];\
1805 const int src4= src[4*srcStride];\
1806 const int src5= src[5*srcStride];\
1807 const int src6= src[6*srcStride];\
1808 const int src7= src[7*srcStride];\
1809 const int src8= src[8*srcStride];\
1810 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1811 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1812 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1813 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1814 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1815 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1816 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1817 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1818 dst++;\
1819 src++;\
1820 }\
1821 }\
1822 \
1823 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1824 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1825 int i;\
1826 \
1827 for(i=0; i<h; i++)\
1828 {\
1829 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1830 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1831 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1832 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1833 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1834 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1835 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1836 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1837 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1838 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1839 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1840 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1841 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1842 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1843 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1844 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1845 dst+=dstStride;\
1846 src+=srcStride;\
1847 }\
1848 }\
1849 \
1850 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1851 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1852 int i;\
1853 const int w=16;\
1854 for(i=0; i<w; i++)\
1855 {\
1856 const int src0= src[0*srcStride];\
1857 const int src1= src[1*srcStride];\
1858 const int src2= src[2*srcStride];\
1859 const int src3= src[3*srcStride];\
1860 const int src4= src[4*srcStride];\
1861 const int src5= src[5*srcStride];\
1862 const int src6= src[6*srcStride];\
1863 const int src7= src[7*srcStride];\
1864 const int src8= src[8*srcStride];\
1865 const int src9= src[9*srcStride];\
1866 const int src10= src[10*srcStride];\
1867 const int src11= src[11*srcStride];\
1868 const int src12= src[12*srcStride];\
1869 const int src13= src[13*srcStride];\
1870 const int src14= src[14*srcStride];\
1871 const int src15= src[15*srcStride];\
1872 const int src16= src[16*srcStride];\
1873 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1874 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1875 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1876 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1877 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1878 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1879 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1880 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1881 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1882 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1883 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1884 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1885 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1886 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1887 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1888 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1889 dst++;\
1890 src++;\
1891 }\
1892 }\
1893 \
1894 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1895 OPNAME ## pixels8_c(dst, src, stride, 8);\
1896 }\
1897 \
1898 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1899 uint8_t half[64];\
1900 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1901 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1902 }\
1903 \
1904 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1905 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1906 }\
1907 \
1908 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1909 uint8_t half[64];\
1910 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1911 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1912 }\
1913 \
1914 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1915 uint8_t full[16*9];\
1916 uint8_t half[64];\
1917 copy_block9(full, src, 16, stride, 9);\
1918 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1919 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1920 }\
1921 \
1922 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1923 uint8_t full[16*9];\
1924 copy_block9(full, src, 16, stride, 9);\
1925 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1926 }\
1927 \
1928 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1929 uint8_t full[16*9];\
1930 uint8_t half[64];\
1931 copy_block9(full, src, 16, stride, 9);\
1932 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1933 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1934 }\
1935 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1936 uint8_t full[16*9];\
1937 uint8_t halfH[72];\
1938 uint8_t halfV[64];\
1939 uint8_t halfHV[64];\
1940 copy_block9(full, src, 16, stride, 9);\
1941 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1942 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1943 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1944 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1945 }\
1946 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1947 uint8_t full[16*9];\
1948 uint8_t halfH[72];\
1949 uint8_t halfHV[64];\
1950 copy_block9(full, src, 16, stride, 9);\
1951 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1952 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1953 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1954 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1955 }\
1956 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1957 uint8_t full[16*9];\
1958 uint8_t halfH[72];\
1959 uint8_t halfV[64];\
1960 uint8_t halfHV[64];\
1961 copy_block9(full, src, 16, stride, 9);\
1962 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1963 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1964 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1965 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1966 }\
1967 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1968 uint8_t full[16*9];\
1969 uint8_t halfH[72];\
1970 uint8_t halfHV[64];\
1971 copy_block9(full, src, 16, stride, 9);\
1972 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1973 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1974 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1975 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1976 }\
1977 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1978 uint8_t full[16*9];\
1979 uint8_t halfH[72];\
1980 uint8_t halfV[64];\
1981 uint8_t halfHV[64];\
1982 copy_block9(full, src, 16, stride, 9);\
1983 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1984 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1985 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1986 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1987 }\
1988 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1989 uint8_t full[16*9];\
1990 uint8_t halfH[72];\
1991 uint8_t halfHV[64];\
1992 copy_block9(full, src, 16, stride, 9);\
1993 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1994 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1995 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1996 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1997 }\
1998 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1999 uint8_t full[16*9];\
2000 uint8_t halfH[72];\
2001 uint8_t halfV[64];\
2002 uint8_t halfHV[64];\
2003 copy_block9(full, src, 16, stride, 9);\
2004 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
2005 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
2006 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2007 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
2008 }\
2009 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2010 uint8_t full[16*9];\
2011 uint8_t halfH[72];\
2012 uint8_t halfHV[64];\
2013 copy_block9(full, src, 16, stride, 9);\
2014 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2015 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
2016 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2017 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
2018 }\
2019 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2020 uint8_t halfH[72];\
2021 uint8_t halfHV[64];\
2022 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2023 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2024 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
2025 }\
2026 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2027 uint8_t halfH[72];\
2028 uint8_t halfHV[64];\
2029 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2030 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2031 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
2032 }\
2033 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2034 uint8_t full[16*9];\
2035 uint8_t halfH[72];\
2036 uint8_t halfV[64];\
2037 uint8_t halfHV[64];\
2038 copy_block9(full, src, 16, stride, 9);\
2039 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2040 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
2041 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2042 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2043 }\
2044 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2045 uint8_t full[16*9];\
2046 uint8_t halfH[72];\
2047 copy_block9(full, src, 16, stride, 9);\
2048 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2049 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
2050 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2051 }\
2052 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2053 uint8_t full[16*9];\
2054 uint8_t halfH[72];\
2055 uint8_t halfV[64];\
2056 uint8_t halfHV[64];\
2057 copy_block9(full, src, 16, stride, 9);\
2058 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2059 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
2060 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2061 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2062 }\
2063 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2064 uint8_t full[16*9];\
2065 uint8_t halfH[72];\
2066 copy_block9(full, src, 16, stride, 9);\
2067 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2068 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
2069 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2070 }\
2071 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2072 uint8_t halfH[72];\
2073 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2074 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2075 }\
2076 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2077 OPNAME ## pixels16_c(dst, src, stride, 16);\
2078 }\
2079 \
2080 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2081 uint8_t half[256];\
2082 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2083 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2084 }\
2085 \
2086 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2087 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2088 }\
2089 \
2090 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2091 uint8_t half[256];\
2092 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2093 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2094 }\
2095 \
2096 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2097 uint8_t full[24*17];\
2098 uint8_t half[256];\
2099 copy_block17(full, src, 24, stride, 17);\
2100 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2101 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2102 }\
2103 \
2104 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2105 uint8_t full[24*17];\
2106 copy_block17(full, src, 24, stride, 17);\
2107 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2108 }\
2109 \
2110 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2111 uint8_t full[24*17];\
2112 uint8_t half[256];\
2113 copy_block17(full, src, 24, stride, 17);\
2114 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2115 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2116 }\
2117 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2118 uint8_t full[24*17];\
2119 uint8_t halfH[272];\
2120 uint8_t halfV[256];\
2121 uint8_t halfHV[256];\
2122 copy_block17(full, src, 24, stride, 17);\
2123 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2124 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2125 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2126 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2127 }\
2128 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2129 uint8_t full[24*17];\
2130 uint8_t halfH[272];\
2131 uint8_t halfHV[256];\
2132 copy_block17(full, src, 24, stride, 17);\
2133 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2134 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2135 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2136 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2137 }\
2138 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2139 uint8_t full[24*17];\
2140 uint8_t halfH[272];\
2141 uint8_t halfV[256];\
2142 uint8_t halfHV[256];\
2143 copy_block17(full, src, 24, stride, 17);\
2144 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2145 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2146 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2147 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2148 }\
2149 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2150 uint8_t full[24*17];\
2151 uint8_t halfH[272];\
2152 uint8_t halfHV[256];\
2153 copy_block17(full, src, 24, stride, 17);\
2154 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2155 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2156 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2157 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2158 }\
2159 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2160 uint8_t full[24*17];\
2161 uint8_t halfH[272];\
2162 uint8_t halfV[256];\
2163 uint8_t halfHV[256];\
2164 copy_block17(full, src, 24, stride, 17);\
2165 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2166 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2167 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2168 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2169 }\
2170 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2171 uint8_t full[24*17];\
2172 uint8_t halfH[272];\
2173 uint8_t halfHV[256];\
2174 copy_block17(full, src, 24, stride, 17);\
2175 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2176 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2177 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2178 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2179 }\
2180 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2181 uint8_t full[24*17];\
2182 uint8_t halfH[272];\
2183 uint8_t halfV[256];\
2184 uint8_t halfHV[256];\
2185 copy_block17(full, src, 24, stride, 17);\
2186 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2187 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2188 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2189 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2190 }\
2191 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2192 uint8_t full[24*17];\
2193 uint8_t halfH[272];\
2194 uint8_t halfHV[256];\
2195 copy_block17(full, src, 24, stride, 17);\
2196 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2197 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2198 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2199 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2200 }\
2201 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2202 uint8_t halfH[272];\
2203 uint8_t halfHV[256];\
2204 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2205 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2206 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2207 }\
2208 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2209 uint8_t halfH[272];\
2210 uint8_t halfHV[256];\
2211 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2212 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2213 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2214 }\
2215 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2216 uint8_t full[24*17];\
2217 uint8_t halfH[272];\
2218 uint8_t halfV[256];\
2219 uint8_t halfHV[256];\
2220 copy_block17(full, src, 24, stride, 17);\
2221 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2222 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2223 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2224 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2225 }\
2226 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2227 uint8_t full[24*17];\
2228 uint8_t halfH[272];\
2229 copy_block17(full, src, 24, stride, 17);\
2230 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2231 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2232 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2233 }\
2234 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2235 uint8_t full[24*17];\
2236 uint8_t halfH[272];\
2237 uint8_t halfV[256];\
2238 uint8_t halfHV[256];\
2239 copy_block17(full, src, 24, stride, 17);\
2240 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2241 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2242 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2243 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2244 }\
2245 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2246 uint8_t full[24*17];\
2247 uint8_t halfH[272];\
2248 copy_block17(full, src, 24, stride, 17);\
2249 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2250 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2251 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2252 }\
2253 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2254 uint8_t halfH[272];\
2255 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2256 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2257 }
2258
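/* cm[] is the clamping (crop) table; since the filter gain is 32,
 * (b + 16) >> 5 rounds to nearest, while the no_rnd variants bias with
 * +15 so that halfway cases round down. */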
2259 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2260 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2261 #define op_put(a, b) a = cm[((b) + 16)>>5]
2262 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2263
2264 QPEL_MC(0, put_ , _ , op_put)
2265 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2266 QPEL_MC(0, avg_ , _ , op_avg)
2267 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2268 #undef op_avg
2269 #undef op_avg_no_rnd
2270 #undef op_put
2271 #undef op_put_no_rnd
2272
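/* H.264 luma interpolation. Half-pel positions use the standard 6-tap
 * filter (1, -5, 20, 20, -5, 1) with gain 32, normalized through OP as
 * (b + 16) >> 5. For the diagonal (hv) cases the horizontal pass is kept
 * unclamped in a 16-bit tmp buffer, so the vertical pass must renormalize
 * the squared gain through OP2 as (b + 512) >> 10. */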
2273 #if 1
2274 #define H264_LOWPASS(OPNAME, OP, OP2) \
2275 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2276 const int h=2;\
2277 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2278 int i;\
2279 for(i=0; i<h; i++)\
2280 {\
2281 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2282 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2283 dst+=dstStride;\
2284 src+=srcStride;\
2285 }\
2286 }\
2287 \
2288 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2289 const int w=2;\
2290 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2291 int i;\
2292 for(i=0; i<w; i++)\
2293 {\
2294 const int srcB= src[-2*srcStride];\
2295 const int srcA= src[-1*srcStride];\
2296 const int src0= src[0 *srcStride];\
2297 const int src1= src[1 *srcStride];\
2298 const int src2= src[2 *srcStride];\
2299 const int src3= src[3 *srcStride];\
2300 const int src4= src[4 *srcStride];\
2301 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2302 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2303 dst++;\
2304 src++;\
2305 }\
2306 }\
2307 \
2308 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2309 const int h=2;\
2310 const int w=2;\
2311 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2312 int i;\
2313 src -= 2*srcStride;\
2314 for(i=0; i<h+5; i++)\
2315 {\
2316 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2317 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2318 tmp+=tmpStride;\
2319 src+=srcStride;\
2320 }\
2321 tmp -= tmpStride*(h+5-2);\
2322 for(i=0; i<w; i++)\
2323 {\
2324 const int tmpB= tmp[-2*tmpStride];\
2325 const int tmpA= tmp[-1*tmpStride];\
2326 const int tmp0= tmp[0 *tmpStride];\
2327 const int tmp1= tmp[1 *tmpStride];\
2328 const int tmp2= tmp[2 *tmpStride];\
2329 const int tmp3= tmp[3 *tmpStride];\
2330 const int tmp4= tmp[4 *tmpStride];\
2331 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2332 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2333 dst++;\
2334 tmp++;\
2335 }\
2336 }\
2337 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2338 const int h=4;\
2339 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2340 int i;\
2341 for(i=0; i<h; i++)\
2342 {\
2343 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2344 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2345 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2346 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2347 dst+=dstStride;\
2348 src+=srcStride;\
2349 }\
2350 }\
2351 \
2352 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2353 const int w=4;\
2354 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2355 int i;\
2356 for(i=0; i<w; i++)\
2357 {\
2358 const int srcB= src[-2*srcStride];\
2359 const int srcA= src[-1*srcStride];\
2360 const int src0= src[0 *srcStride];\
2361 const int src1= src[1 *srcStride];\
2362 const int src2= src[2 *srcStride];\
2363 const int src3= src[3 *srcStride];\
2364 const int src4= src[4 *srcStride];\
2365 const int src5= src[5 *srcStride];\
2366 const int src6= src[6 *srcStride];\
2367 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2368 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2369 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2370 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2371 dst++;\
2372 src++;\
2373 }\
2374 }\
2375 \
2376 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2377 const int h=4;\
2378 const int w=4;\
2379 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2380 int i;\
2381 src -= 2*srcStride;\
2382 for(i=0; i<h+5; i++)\
2383 {\
2384 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2385 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2386 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2387 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2388 tmp+=tmpStride;\
2389 src+=srcStride;\
2390 }\
2391 tmp -= tmpStride*(h+5-2);\
2392 for(i=0; i<w; i++)\
2393 {\
2394 const int tmpB= tmp[-2*tmpStride];\
2395 const int tmpA= tmp[-1*tmpStride];\
2396 const int tmp0= tmp[0 *tmpStride];\
2397 const int tmp1= tmp[1 *tmpStride];\
2398 const int tmp2= tmp[2 *tmpStride];\
2399 const int tmp3= tmp[3 *tmpStride];\
2400 const int tmp4= tmp[4 *tmpStride];\
2401 const int tmp5= tmp[5 *tmpStride];\
2402 const int tmp6= tmp[6 *tmpStride];\
2403 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2404 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2405 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2406 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2407 dst++;\
2408 tmp++;\
2409 }\
2410 }\
2411 \
2412 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2413 const int h=8;\
2414 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2415 int i;\
2416 for(i=0; i<h; i++)\
2417 {\
2418 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2419 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2420 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2421 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2422 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2423 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2424 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2425 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2426 dst+=dstStride;\
2427 src+=srcStride;\
2428 }\
2429 }\
2430 \
2431 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2432 const int w=8;\
2433 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2434 int i;\
2435 for(i=0; i<w; i++)\
2436 {\
2437 const int srcB= src[-2*srcStride];\
2438 const int srcA= src[-1*srcStride];\
2439 const int src0= src[0 *srcStride];\
2440 const int src1= src[1 *srcStride];\
2441 const int src2= src[2 *srcStride];\
2442 const int src3= src[3 *srcStride];\
2443 const int src4= src[4 *srcStride];\
2444 const int src5= src[5 *srcStride];\
2445 const int src6= src[6 *srcStride];\
2446 const int src7= src[7 *srcStride];\
2447 const int src8= src[8 *srcStride];\
2448 const int src9= src[9 *srcStride];\
2449 const int src10=src[10*srcStride];\
2450 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2451 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2452 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2453 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2454 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2455 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2456 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2457 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2458 dst++;\
2459 src++;\
2460 }\
2461 }\
2462 \
2463 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2464 const int h=8;\
2465 const int w=8;\
2466 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2467 int i;\
2468 src -= 2*srcStride;\
2469 for(i=0; i<h+5; i++)\
2470 {\
2471 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2472 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2473 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2474 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2475 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2476 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2477 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2478 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2479 tmp+=tmpStride;\
2480 src+=srcStride;\
2481 }\
2482 tmp -= tmpStride*(h+5-2);\
2483 for(i=0; i<w; i++)\
2484 {\
2485 const int tmpB= tmp[-2*tmpStride];\
2486 const int tmpA= tmp[-1*tmpStride];\
2487 const int tmp0= tmp[0 *tmpStride];\
2488 const int tmp1= tmp[1 *tmpStride];\
2489 const int tmp2= tmp[2 *tmpStride];\
2490 const int tmp3= tmp[3 *tmpStride];\
2491 const int tmp4= tmp[4 *tmpStride];\
2492 const int tmp5= tmp[5 *tmpStride];\
2493 const int tmp6= tmp[6 *tmpStride];\
2494 const int tmp7= tmp[7 *tmpStride];\
2495 const int tmp8= tmp[8 *tmpStride];\
2496 const int tmp9= tmp[9 *tmpStride];\
2497 const int tmp10=tmp[10*tmpStride];\
2498 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2499 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2500 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2501 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2502 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2503 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2504 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2505 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2506 dst++;\
2507 tmp++;\
2508 }\
2509 }\
2510 \
2511 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2512 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2513 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2514 src += 8*srcStride;\
2515 dst += 8*dstStride;\
2516 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2517 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2518 }\
2519 \
2520 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2521 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2522 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2523 src += 8*srcStride;\
2524 dst += 8*dstStride;\
2525 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2526 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2527 }\
2528 \
2529 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2530 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2531 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2532 src += 8*srcStride;\
2533 dst += 8*dstStride;\
2534 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2535 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2536 }\
2537
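/* H264_MC builds all 16 quarter-pel positions for one block size from the
 * lowpass primitives above. In mcXY, X is the horizontal and Y the vertical
 * quarter-pel phase: mc20/mc02 are the pure half-pel filters, mc22 is the
 * diagonal hv filter, and the remaining positions average two intermediate
 * planes with pixels*_l2(). */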
2538 #define H264_MC(OPNAME, SIZE) \
2539 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2540 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2541 }\
2542 \
2543 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2544 uint8_t half[SIZE*SIZE];\
2545 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2546 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2547 }\
2548 \
2549 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2550 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2551 }\
2552 \
2553 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2554 uint8_t half[SIZE*SIZE];\
2555 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2556 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2557 }\
2558 \
2559 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2560 uint8_t full[SIZE*(SIZE+5)];\
2561 uint8_t * const full_mid= full + SIZE*2;\
2562 uint8_t half[SIZE*SIZE];\
2563 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2564 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2565 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2566 }\
2567 \
2568 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2569 uint8_t full[SIZE*(SIZE+5)];\
2570 uint8_t * const full_mid= full + SIZE*2;\
2571 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2572 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2573 }\
2574 \
2575 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2576 uint8_t full[SIZE*(SIZE+5)];\
2577 uint8_t * const full_mid= full + SIZE*2;\
2578 uint8_t half[SIZE*SIZE];\
2579 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2580 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2581 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2582 }\
2583 \
2584 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2585 uint8_t full[SIZE*(SIZE+5)];\
2586 uint8_t * const full_mid= full + SIZE*2;\
2587 uint8_t halfH[SIZE*SIZE];\
2588 uint8_t halfV[SIZE*SIZE];\
2589 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2590 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2591 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2592 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2593 }\
2594 \
2595 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2596 uint8_t full[SIZE*(SIZE+5)];\
2597 uint8_t * const full_mid= full + SIZE*2;\
2598 uint8_t halfH[SIZE*SIZE];\
2599 uint8_t halfV[SIZE*SIZE];\
2600 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2601 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2602 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2603 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2604 }\
2605 \
2606 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2607 uint8_t full[SIZE*(SIZE+5)];\
2608 uint8_t * const full_mid= full + SIZE*2;\
2609 uint8_t halfH[SIZE*SIZE];\
2610 uint8_t halfV[SIZE*SIZE];\
2611 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2612 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2613 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2614 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2615 }\
2616 \
2617 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2618 uint8_t full[SIZE*(SIZE+5)];\
2619 uint8_t * const full_mid= full + SIZE*2;\
2620 uint8_t halfH[SIZE*SIZE];\
2621 uint8_t halfV[SIZE*SIZE];\
2622 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2623 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2624 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2625 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2626 }\
2627 \
2628 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2629 int16_t tmp[SIZE*(SIZE+5)];\
2630 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2631 }\
2632 \
2633 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2634 int16_t tmp[SIZE*(SIZE+5)];\
2635 uint8_t halfH[SIZE*SIZE];\
2636 uint8_t halfHV[SIZE*SIZE];\
2637 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2638 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2639 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2640 }\
2641 \
2642 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2643 int16_t tmp[SIZE*(SIZE+5)];\
2644 uint8_t halfH[SIZE*SIZE];\
2645 uint8_t halfHV[SIZE*SIZE];\
2646 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2647 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2648 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2649 }\
2650 \
2651 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2652 uint8_t full[SIZE*(SIZE+5)];\
2653 uint8_t * const full_mid= full + SIZE*2;\
2654 int16_t tmp[SIZE*(SIZE+5)];\
2655 uint8_t halfV[SIZE*SIZE];\
2656 uint8_t halfHV[SIZE*SIZE];\
2657 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2658 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2659 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2660 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2661 }\
2662 \
2663 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2664 uint8_t full[SIZE*(SIZE+5)];\
2665 uint8_t * const full_mid= full + SIZE*2;\
2666 int16_t tmp[SIZE*(SIZE+5)];\
2667 uint8_t halfV[SIZE*SIZE];\
2668 uint8_t halfHV[SIZE*SIZE];\
2669 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2670 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2671 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2672 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2673 }\
2674
2675 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2676 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2677 #define op_put(a, b) a = cm[((b) + 16)>>5]
2678 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2679 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2680
2681 H264_LOWPASS(put_ , op_put, op2_put)
2682 H264_LOWPASS(avg_ , op_avg, op2_avg)
2683 H264_MC(put_, 2)
2684 H264_MC(put_, 4)
2685 H264_MC(put_, 8)
2686 H264_MC(put_, 16)
2687 H264_MC(avg_, 4)
2688 H264_MC(avg_, 8)
2689 H264_MC(avg_, 16)
2690
2691 #undef op_avg
2692 #undef op_put
2693 #undef op2_avg
2694 #undef op2_put
2695 #endif
2696
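/* H.264 weighted prediction. op_scale1 scales a single reference
 * (block[x]*weight plus a rounded offset, shifted down by log2_denom);
 * op_scale2 blends two references with separate weights and a shared
 * offset, shifted by log2_denom+1. The W==2/4/8 tests are compile-time
 * constants, so each instantiation keeps only the ops it needs. */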
2697 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2698 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2699 #define H264_WEIGHT(W,H) \
2700 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2701 int y; \
2702 offset <<= log2_denom; \
2703 if(log2_denom) offset += 1<<(log2_denom-1); \
2704 for(y=0; y<H; y++, block += stride){ \
2705 op_scale1(0); \
2706 op_scale1(1); \
2707 if(W==2) continue; \
2708 op_scale1(2); \
2709 op_scale1(3); \
2710 if(W==4) continue; \
2711 op_scale1(4); \
2712 op_scale1(5); \
2713 op_scale1(6); \
2714 op_scale1(7); \
2715 if(W==8) continue; \
2716 op_scale1(8); \
2717 op_scale1(9); \
2718 op_scale1(10); \
2719 op_scale1(11); \
2720 op_scale1(12); \
2721 op_scale1(13); \
2722 op_scale1(14); \
2723 op_scale1(15); \
2724 } \
2725 } \
2726 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2727 int y; \
2728 offset = ((offset + 1) | 1) << log2_denom; \
2729 for(y=0; y<H; y++, dst += stride, src += stride){ \
2730 op_scale2(0); \
2731 op_scale2(1); \
2732 if(W==2) continue; \
2733 op_scale2(2); \
2734 op_scale2(3); \
2735 if(W==4) continue; \
2736 op_scale2(4); \
2737 op_scale2(5); \
2738 op_scale2(6); \
2739 op_scale2(7); \
2740 if(W==8) continue; \
2741 op_scale2(8); \
2742 op_scale2(9); \
2743 op_scale2(10); \
2744 op_scale2(11); \
2745 op_scale2(12); \
2746 op_scale2(13); \
2747 op_scale2(14); \
2748 op_scale2(15); \
2749 } \
2750 }
2751
2752 H264_WEIGHT(16,16)
2753 H264_WEIGHT(16,8)
2754 H264_WEIGHT(8,16)
2755 H264_WEIGHT(8,8)
2756 H264_WEIGHT(8,4)
2757 H264_WEIGHT(4,8)
2758 H264_WEIGHT(4,4)
2759 H264_WEIGHT(4,2)
2760 H264_WEIGHT(2,4)
2761 H264_WEIGHT(2,2)
2762
2763 #undef op_scale1
2764 #undef op_scale2
2765 #undef H264_WEIGHT
2766
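/* WMV2 (mspel) half-pel filter: 4 taps (-1, 9, 9, -1) with gain 16,
 * rounded to nearest via the +8 bias. The matching vertical filter
 * follows after the codec-specific wrappers below. */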
2767 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2768 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2769 int i;
2770
2771 for(i=0; i<h; i++){
2772 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2773 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2774 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2775 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2776 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2777 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2778 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2779 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2780 dst+=dstStride;
2781 src+=srcStride;
2782 }
2783 }
2784
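/* Trivial codec-specific MC positions below: the CAVS and VC-1 full-pel
 * (mc00) cases are plain copies or averages, and RV40's mc33 case reduces
 * to the existing xy2 (two-dimensional half-pel average) helpers. */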
2785 #if CONFIG_CAVS_DECODER
2786 /* CAVS (Chinese AVS video standard) specific */
2787 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2788 put_pixels8_c(dst, src, stride, 8);
2789 }
2790 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2791 avg_pixels8_c(dst, src, stride, 8);
2792 }
2793 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2794 put_pixels16_c(dst, src, stride, 16);
2795 }
2796 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2797 avg_pixels16_c(dst, src, stride, 16);
2798 }
2799 #endif /* CONFIG_CAVS_DECODER */
2800
2801 #if CONFIG_VC1_DECODER
2802 /* VC-1 specific */
2803 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2804 put_pixels8_c(dst, src, stride, 8);
2805 }
2806 void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2807 avg_pixels8_c(dst, src, stride, 8);
2808 }
2809 #endif /* CONFIG_VC1_DECODER */
2810
2811 /* H.264 specific */
2812 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2813
2814 #if CONFIG_RV40_DECODER
2815 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2816 put_pixels16_xy2_c(dst, src, stride, 16);
2817 }
2818 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2819 avg_pixels16_xy2_c(dst, src, stride, 16);
2820 }
2821 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2822 put_pixels8_xy2_c(dst, src, stride, 8);
2823 }
2824 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2825 avg_pixels8_xy2_c(dst, src, stride, 8);
2826 }
2827 #endif /* CONFIG_RV40_DECODER */
2828
2829 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2830 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2831 int i;
2832
2833 for(i=0; i<w; i++){
2834 const int src_1= src[ -srcStride];
2835 const int src0 = src[0 ];
2836 const int src1 = src[ srcStride];
2837 const int src2 = src[2*srcStride];
2838 const int src3 = src[3*srcStride];
2839 const int src4 = src[4*srcStride];
2840 const int src5 = src[5*srcStride];
2841 const int src6 = src[6*srcStride];
2842 const int src7 = src[7*srcStride];
2843 const int src8 = src[8*srcStride];
2844 const int src9 = src[9*srcStride];
2845 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2846 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2847 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2848 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2849 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2850 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2851 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2852 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2853 src++;
2854 dst++;
2855 }
2856 }
2857
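/* WMV2 mspel positions follow the same mcXY naming as the qpel code:
 * mc20/mc02 are the pure half-pel filters, and the other sub-pel phases
 * blend intermediate planes through put_pixels8_l2(). */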
2858 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2859 put_pixels8_c(dst, src, stride, 8);
2860 }
2861
2862 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2863 uint8_t half[64];
2864 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2865 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2866 }
2867
2868 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2869 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2870 }
2871
2872 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2873 uint8_t half[64];
2874 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2875 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2876 }
2877
2878 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2879 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2880 }
2881
2882 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2883 uint8_t halfH[88];
2884 uint8_t halfV[64];
2885 uint8_t halfHV[64];
2886 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2887 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2888 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2889 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2890 }
2891 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2892 uint8_t halfH[88];
2893 uint8_t halfV[64];
2894 uint8_t halfHV[64];
2895 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2896 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2897 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2898 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2899 }
2900 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2901 uint8_t halfH[88];
2902 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2903 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2904 }
2905
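/* H.263 (Annex J) in-loop deblocking. d measures the step across the block
 * edge, d1 applies the ramp function of the qscale-dependent strength, and
 * the (p&256) trick clamps results in the range -256..511 back to 0..255
 * (negative values become 0, overflows become 255). The CONFIG_ test is a
 * compile-time constant, so the body is discarded entirely when neither
 * the H.263 decoder nor encoder is built. */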
2906 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2907 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2908 int x;
2909 const int strength= ff_h263_loop_filter_strength[qscale];
2910
2911 for(x=0; x<8; x++){
2912 int d1, d2, ad1;
2913 int p0= src[x-2*stride];
2914 int p1= src[x-1*stride];
2915 int p2= src[x+0*stride];
2916 int p3= src[x+1*stride];
2917 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2918
2919 if (d<-2*strength) d1= 0;
2920 else if(d<- strength) d1=-2*strength - d;
2921 else if(d< strength) d1= d;
2922 else if(d< 2*strength) d1= 2*strength - d;
2923 else d1= 0;
2924
2925 p1 += d1;
2926 p2 -= d1;
2927 if(p1&256) p1= ~(p1>>31);
2928 if(p2&256) p2= ~(p2>>31);
2929
2930 src[x-1*stride] = p1;
2931 src[x+0*stride] = p2;
2932
2933 ad1= FFABS(d1)>>1;
2934
2935 d2= av_clip((p0-p3)/4, -ad1, ad1);
2936
2937 src[x-2*stride] = p0 - d2;
2938 src[x+ stride] = p3 + d2;
2939 }
2940 }
2941 }
2942
2943 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2944 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2945 int y;
2946 const int strength= ff_h263_loop_filter_strength[qscale];
2947
2948 for(y=0; y<8; y++){
2949 int d1, d2, ad1;
2950 int p0= src[y*stride-2];
2951 int p1= src[y*stride-1];
2952 int p2= src[y*stride+0];
2953 int p3= src[y*stride+1];
2954 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2955
2956 if (d<-2*strength) d1= 0;
2957 else if(d<- strength) d1=-2*strength - d;
2958 else if(d< strength) d1= d;
2959 else if(d< 2*strength) d1= 2*strength - d;
2960 else d1= 0;
2961
2962 p1 += d1;
2963 p2 -= d1;
2964 if(p1&256) p1= ~(p1>>31);
2965 if(p2&256) p2= ~(p2>>31);
2966
2967 src[y*stride-1] = p1;
2968 src[y*stride+0] = p2;
2969
2970 ad1= FFABS(d1)>>1;
2971
2972 d2= av_clip((p0-p3)/4, -ad1, ad1);
2973
2974 src[y*stride-2] = p0 - d2;
2975 src[y*stride+1] = p3 + d2;
2976 }
2977 }
2978 }
2979
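/* H.261 in-loop filter: a separable [1 2 1]/4 smoothing over the 8x8
 * block; the border rows and columns are passed through unfiltered
 * (only rescaled with rounding). */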
2980 static void h261_loop_filter_c(uint8_t *src, int stride){
2981 int x,y,xy,yz;
2982 int temp[64];
2983
2984 for(x=0; x<8; x++){
2985 temp[x ] = 4*src[x ];
2986 temp[x + 7*8] = 4*src[x + 7*stride];
2987 }
2988 for(y=1; y<7; y++){
2989 for(x=0; x<8; x++){
2990 xy = y * stride + x;
2991 yz = y * 8 + x;
2992 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2993 }
2994 }
2995
2996 for(y=0; y<8; y++){
2997 src[ y*stride] = (temp[ y*8] + 2)>>2;
2998 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2999 for(x=1; x<7; x++){
3000 xy = y * stride + x;
3001 yz = y * 8 + x;
3002 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
3003 }
3004 }
3005 }
3006
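/* H.264 normal (tc0-based) luma deblocking, shared by the horizontal and
 * vertical wrappers below via the xstride/ystride swap. Each 4-pel segment
 * is skipped when tc0[i] < 0; otherwise p0/q0 are corrected by a delta
 * clipped to +-tc, where tc grows by one for each side whose second
 * neighbour passes the |p2-p0| (resp. |q2-q0|) < beta test and therefore
 * also gets a p1/q1 adjustment. */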
3007 static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
3008 {
3009 int i, d;
3010 for( i = 0; i < 4; i++ ) {
3011 if( tc0[i] < 0 ) {
3012 pix += 4*ystride;
3013 continue;
3014 }
3015 for( d = 0; d < 4; d++ ) {
3016 const int p0 = pix[-1*xstride];
3017 const int p1 = pix[-2*xstride];
3018 const int p2 = pix[-3*xstride];
3019 const int q0 = pix[0];
3020 const int q1 = pix[1*xstride];
3021 const int q2 = pix[2*xstride];
3022
3023 if( FFABS( p0 - q0 ) < alpha &&
3024 FFABS( p1 - p0 ) < beta &&
3025 FFABS( q1 - q0 ) < beta ) {
3026
3027 int tc = tc0[i];
3028 int i_delta;
3029
3030 if( FFABS( p2 - p0 ) < beta ) {
3031 if(tc0[i])
3032 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
3033 tc++;
3034 }
3035 if( FFABS( q2 - q0 ) < beta ) {
3036 if(tc0[i])
3037 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
3038 tc++;
3039 }
3040
3041 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3042 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
3043 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
3044 }
3045 pix += ystride;
3046 }
3047 }
3048 }
3049 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3050 {
3051 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
3052 }
3053 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3054 {
3055 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
3056 }
3057
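/* Intra ("strong") luma deblocking: when the edge looks flat enough
 * (|p0-q0| < (alpha>>2)+2), up to three pixels per side are smoothed;
 * otherwise only p0/q0 are filtered. */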
3058 static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3059 {
3060 int d;
3061 for( d = 0; d < 16; d++ ) {
3062 const int p2 = pix[-3*xstride];
3063 const int p1 = pix[-2*xstride];
3064 const int p0 = pix[-1*xstride];
3065
3066 const int q0 = pix[ 0*xstride];
3067 const int q1 = pix[ 1*xstride];
3068 const int q2 = pix[ 2*xstride];
3069
3070 if( FFABS( p0 - q0 ) < alpha &&
3071 FFABS( p1 - p0 ) < beta &&
3072 FFABS( q1 - q0 ) < beta ) {
3073
3074 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
3075 if( FFABS( p2 - p0 ) < beta)
3076 {
3077 const int p3 = pix[-4*xstride];
3078 /* p0', p1', p2' */
3079 pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
3080 pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
3081 pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
3082 } else {
3083 /* p0' */
3084 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3085 }
3086 if( FFABS( q2 - q0 ) < beta)
3087 {
3088 const int q3 = pix[3*xstride];
3089 /* q0', q1', q2' */
3090 pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
3091 pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
3092 pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
3093 } else {
3094 /* q0' */
3095 pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
3096 }
3097 }else{
3098 /* p0', q0' */
3099 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3100 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
3101 }
3102 }
3103 pix += ystride;
3104 }
3105 }
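
/* Intra (bS=4) luma filtering above follows the H.264 strong/weak split:
 * only when the step across the edge is small relative to alpha
 * (|p0-q0| < (alpha>>2)+2) and the side is smooth (|p2-p0| < beta,
 * resp. |q2-q0| < beta) are three samples on that side rewritten with the
 * stronger filter that also reads p3/q3; otherwise only p0/q0 get the
 * 3-tap (2*p1 + p0 + q1 + 2) >> 2 smoothing, so genuine edges survive. */
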
3106 static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3107 {
3108 h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
3109 }
3110 static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3111 {
3112 h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
3113 }
3114
3115 static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
3116 {
3117 int i, d;
3118 for( i = 0; i < 4; i++ ) {
3119 const int tc = tc0[i];
3120 if( tc <= 0 ) { /* unlike the luma filter, tc == 0 also means "skip" here */
3121 pix += 2*ystride;
3122 continue;
3123 }
3124 for( d = 0; d < 2; d++ ) {
3125 const int p0 = pix[-1*xstride];
3126 const int p1 = pix[-2*xstride];
3127 const int q0 = pix[0];
3128 const int q1 = pix[1*xstride];
3129
3130 if( FFABS( p0 - q0 ) < alpha &&
3131 FFABS( p1 - p0 ) < beta &&
3132 FFABS( q1 - q0 ) < beta ) {
3133
3134 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3135
3136 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
3137 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
3138 }
3139 pix += ystride;
3140 }
3141 }
3142 }
3143 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3144 {
3145 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
3146 }
3147 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3148 {
3149 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
3150 }
3151
3152 static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3153 {
3154 int d;
3155 for( d = 0; d < 8; d++ ) {
3156 const int p0 = pix[-1*xstride];
3157 const int p1 = pix[-2*xstride];
3158 const int q0 = pix[0];
3159 const int q1 = pix[1*xstride];
3160
3161 if( FFABS( p0 - q0 ) < alpha &&
3162 FFABS( p1 - p0 ) < beta &&
3163 FFABS( q1 - q0 ) < beta ) {
3164
3165 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
3166 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
3167 }
3168 pix += ystride;
3169 }
3170 }
3171 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3172 {
3173 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
3174 }
3175 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3176 {
3177 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
3178 }
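
/* The chroma filters mirror the luma ones but never modify more than one
 * sample on each side of the edge: the inter variant applies only the
 * clipped delta to p0/q0, and the intra variant only the 3-tap
 * (2*p1 + p0 + q1 + 2) >> 2 smoothing, matching the shorter chroma filter
 * of the H.264 spec. */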
3179
3180 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3181 {
3182 int s, i;
3183
3184 s = 0;
3185 for(i=0;i<h;i++) {
3186 s += abs(pix1[0] - pix2[0]);
3187 s += abs(pix1[1] - pix2[1]);
3188 s += abs(pix1[2] - pix2[2]);
3189 s += abs(pix1[3] - pix2[3]);
3190 s += abs(pix1[4] - pix2[4]);
3191 s += abs(pix1[5] - pix2[5]);
3192 s += abs(pix1[6] - pix2[6]);
3193 s += abs(pix1[7] - pix2[7]);
3194 s += abs(pix1[8] - pix2[8]);
3195 s += abs(pix1[9] - pix2[9]);
3196 s += abs(pix1[10] - pix2[10]);
3197 s += abs(pix1[11] - pix2[11]);
3198 s += abs(pix1[12] - pix2[12]);
3199 s += abs(pix1[13] - pix2[13]);
3200 s += abs(pix1[14] - pix2[14]);
3201 s += abs(pix1[15] - pix2[15]);
3202 pix1 += line_size;
3203 pix2 += line_size;
3204 }
3205 return s;
3206 }
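
/* pix_abs16_c is the 16-wide sum of absolute differences (SAD), the basic
 * motion-estimation cost. The body is unrolled on purpose; a minimal
 * rolled-up sketch of the same computation (j is a local index not present
 * above):
 *
 *     for (s = 0, i = 0; i < h; i++, pix1 += line_size, pix2 += line_size)
 *         for (j = 0; j < 16; j++)
 *             s += abs(pix1[j] - pix2[j]);
 */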
3207
3208 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3209 {
3210 int s, i;
3211
3212 s = 0;
3213 for(i=0;i<h;i++) {
3214 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3215 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3216 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3217 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3218 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3219 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3220 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3221 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3222 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
3223 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
3224 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
3225 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
3226 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
3227 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
3228 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
3229 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
3230 pix1 += line_size;
3231 pix2 += line_size;
3232 }
3233 return s;
3234 }
3235
3236 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3237 {
3238 int s, i;
3239 uint8_t *pix3 = pix2 + line_size;
3240
3241 s = 0;
3242 for(i=0;i<h;i++) {
3243 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3244 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3245 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3246 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3247 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3248 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3249 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3250 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3251 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
3252 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
3253 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
3254 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
3255 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
3256 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
3257 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
3258 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
3259 pix1 += line_size;
3260 pix2 += line_size;
3261 pix3 += line_size;
3262 }
3263 return s;
3264 }
3265
3266 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3267 {
3268 int s, i;
3269 uint8_t *pix3 = pix2 + line_size;
3270
3271 s = 0;
3272 for(i=0;i<h;i++) {
3273 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3274 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3275 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3276 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3277 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3278 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3279 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3280 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3281 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3282 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3283 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3284 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3285 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3286 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3287 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3288 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
3289 pix1 += line_size;
3290 pix2 += line_size;
3291 pix3 += line_size;
3292 }
3293 return s;
3294 }
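
/* The _x2/_y2/_xy2 variants score pix1 against the three half-pel
 * interpolations of pix2, with the same upward rounding as the motion
 * compensation code: avg2(a,b) = (a+b+1)>>1 for the horizontal/vertical
 * half-sample positions and avg4(a,b,c,d) = (a+b+c+d+2)>>2 for the
 * diagonal one (both defined earlier in this file). Reading pix2[16] is
 * safe for the padded reference planes these functions are given. */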
3295
3296 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3297 {
3298 int s, i;
3299
3300 s = 0;
3301 for(i=0;i<h;i++) {
3302 s += abs(pix1[0] - pix2[0]);
3303 s += abs(pix1[1] - pix2[1]);
3304 s += abs(pix1[2] - pix2[2]);
3305 s += abs(pix1[3] - pix2[3]);
3306 s += abs(pix1[4] - pix2[4]);
3307 s += abs(pix1[5] - pix2[5]);
3308 s += abs(pix1[6] - pix2[6]);
3309 s += abs(pix1[7] - pix2[7]);
3310 pix1 += line_size;
3311 pix2 += line_size;
3312 }
3313 return s;
3314 }
3315
3316 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3317 {
3318 int s, i;
3319
3320 s = 0;
3321 for(i=0;i<h;i++) {
3322 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3323 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3324 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3325 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3326 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3327 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3328 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3329 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3330 pix1 += line_size;
3331 pix2 += line_size;
3332 }
3333 return s;
3334 }
3335
3336 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3337 {
3338 int s, i;
3339 uint8_t *pix3 = pix2 + line_size;
3340
3341 s = 0;
3342 for(i=0;i<h;i++) {
3343 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3344 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3345 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3346 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3347 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3348 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3349 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3350 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3351 pix1 += line_size;
3352 pix2 += line_size;
3353 pix3 += line_size;
3354 }
3355 return s;
3356 }
3357
3358 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3359 {
3360 int s, i;
3361 uint8_t *pix3 = pix2 + line_size;
3362
3363 s = 0;
3364 for(i=0;i<h;i++) {
3365 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3366 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3367 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3368 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3369 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3370 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3371 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3372 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3373 pix1 += line_size;
3374 pix2 += line_size;
3375 pix3 += line_size;
3376 }
3377 return s;
3378 }
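
/* The pix_abs8_* family is the 8-pixel-wide counterpart of the functions
 * above, used for comparisons on 8x8 blocks. */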
3379
3380 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3381 MpegEncContext *c = v;
3382 int score1=0;
3383 int score2=0;
3384 int x,y;
3385
3386 for(y=0; y<h; y++){
3387 for(x=0; x<16; x++){
3388 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3389 }
3390 if(y+1<h){
3391 for(x=0; x<15; x++){
3392 score2+= FFABS( s1[x ] - s1[x +stride]
3393 - s1[x+1] + s1[x+1+stride])
3394 -FFABS( s2[x ] - s2[x +stride]
3395 - s2[x+1] + s2[x+1+stride]);
3396 }
3397 }
3398 s1+= stride;
3399 s2+= stride;
3400 }
3401
3402 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3403 else return score1 + FFABS(score2)*8;
3404 }
3405
3406 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3407 MpegEncContext *c = v;
3408 int score1=0;
3409 int score2=0;
3410 int x,y;
3411
3412 for(y=0; y<h; y++){
3413 for(x=0; x<8; x++){
3414 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3415 }
3416 if(y+1<h){
3417 for(x=0; x<7; x++){
3418 score2+= FFABS( s1[x ] - s1[x +stride]
3419 - s1[x+1] + s1[x+1+stride])
3420 -FFABS( s2[x ] - s2[x +stride]
3421 - s2[x+1] + s2[x+1+stride]);
3422 }
3423 }
3424 s1+= stride;
3425 s2+= stride;
3426 }
3427
3428 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3429 else return score1 + FFABS(score2)*8;
3430 }
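
/* nsse16_c/nsse8_c implement the "noise preserving" SSE comparison: on top
 * of the plain squared error (score1) they accumulate, in score2, the
 * difference between the 2x2 second-order variation of the source and of
 * the candidate. An encode that blurs texture away keeps score1 low but
 * makes |score2| large, so the final cost
 *     score1 + |score2| * nsse_weight
 * steers the encoder toward preserving noise and texture; the weight falls
 * back to 8 when no encoder context is available. */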
3431
3432 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3433 int i;
3434 unsigned int sum=0;
3435
3436 for(i=0; i<8*8; i++){
3437 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3438 int w= weight[i];
3439 b>>= RECON_SHIFT;
3440 assert(-512<b && b<512);
3441
3442 sum += (w*b)*(w*b)>>4;
3443 }
3444 return sum>>2;
3445 }
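
/* try_8x8basis_c estimates, without modifying rem[], the perceptually
 * weighted squared error that would remain if `scale` times one basis
 * function were added to the current residual: basis[i]*scale is rounded
 * down from BASIS_SHIFT to the RECON_SHIFT fixed point of rem[], the sum
 * is reduced to integer precision, and the cost is the fixed-point sum of
 * (w*b)^2. add_8x8basis_c below commits the same update to rem[]. */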
3446
3447 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3448 int i;
3449
3450 for(i=0; i<8*8; i++){
3451 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3452 }
3453 }