/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/dsputil.c
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "mathops.h"
#include "snow.h"
#include "mpegvideo.h"
#include "config.h"
#include "lpc.h"
#include "ac3dec.h"
#include "vorbis.h"
#include "png.h"

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* eaidct.c */
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);

/* binkidct.c */
void ff_bink_idct_c    (DCTELEM *block);
void ff_bink_idct_add_c(uint8_t *dest, int linesize, DCTELEM *block);
void ff_bink_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)

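/* Illustrative note (added for clarity; not from the original source):
 * ~0UL/255 is the all-bytes-0x01 constant for the native word size, so
 * multiplying by 0x7f puts 0x7f in every byte:
 *   32-bit long: ~0UL/255 = 0x01010101         -> pb_7f = 0x7f7f7f7f
 *   64-bit long: ~0UL/255 = 0x0101010101010101 -> pb_7f = 0x7f7f7f7f7f7f7f7f
 */
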
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];

const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256;
 * for a>16909558 it is an overestimate by less than 1 part in 1<<24 */
const uint32_t ff_inverse[257]={
    0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
    536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
    268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
    178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
    134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
    107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
    89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
    76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
    67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
    59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
    53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
    48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
    44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
    41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
    38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
    35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
    33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
    31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
    29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
    28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
    26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
    25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
    24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
    23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
    22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
    21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
    20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
    19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
    19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
    18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
    17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
    17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
    16777216
};
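
/* Worked example (illustrative, added for clarity): ff_inverse[3] is
 * 1431655766 = ceil(2^32 / 3), so for a = 1000000
 *   (uint32_t)(((uint64_t)1000000 * 1431655766) >> 32) == 333333 == 1000000/3.
 * This replaces a division by a multiply and a shift; as the comment above
 * notes, the result is exact for a <= 16909558.
 */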

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};

void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#if ARCH_PPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
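
/* Usage sketch (illustrative; 'identity' and the wrapper function are
 * hypothetical): with an identity permutation st->permutated simply mirrors
 * the scan, and st->raster_end[i] holds the largest permuted index among the
 * first i+1 scan positions, which lets coefficient loops terminate early.
 */
#if 0
static void init_identity_scantable_example(void)
{
    ScanTable st;
    uint8_t identity[64];
    int k;
    for (k = 0; k < 64; k++)
        identity[k] = k;
    ff_init_scantable(identity, &st, ff_zigzag_direct);
}
#endif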

static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
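
/* Note (illustrative, added for clarity): sq points 256 entries into
 * ff_squareTbl, whose entry 256+d holds d*d (the table is filled in at init
 * time), so the signed difference d = pix1[n] - pix2[n] in [-255,255] can be
 * used directly as an index; e.g. d = -3 reads sq[-3] = ff_squareTbl[253] = 9.
 */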


#if CONFIG_SNOW_ENCODER //dwt is in snow.c
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
        {
            {
                // 9/7 8x8 dec=3
                {268, 239, 239, 213},
                {  0, 224, 224, 152},
                {  0, 135, 135, 110},
            },{
                // 9/7 16x16 or 32x32 dec=4
                {344, 310, 310, 280},
                {  0, 320, 320, 228},
                {  0, 175, 175, 136},
                {  0, 129, 129, 102},
            }
        },{
            {
                // 5/3 8x8 dec=3
                {275, 245, 245, 218},
                {  0, 230, 230, 156},
                {  0, 138, 138, 113},
            },{
                // 5/3 16x16 or 32x32 dec=4
                {352, 317, 317, 286},
                {  0, 328, 328, 233},
                {  0, 180, 180, 140},
                {  0, 132, 132, 105},
            }
        }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif

/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}

/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                         int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
        //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

        //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
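
/* Usage sketch (illustrative; 'pic', 'linesize', 'pic_width' and
 * 'pic_height' are hypothetical): fetch an 8x8 block whose top-left corner
 * lies 2 rows above and 3 columns left of the picture, as happens during
 * motion compensation near a frame border.
 */
#if 0
static void emulated_edge_example(uint8_t *pic, int linesize, int pic_width, int pic_height)
{
    uint8_t buf[8 * 64];    /* assumes linesize == 64 for this sketch */
    ff_emulated_edge_mc(buf, pic + (-2)*linesize + (-3), linesize,
                        8, 8, -3, -2, pic_width, pic_height);
    /* buf now holds the 8x8 block with out-of-picture samples replicated */
}
#endif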

static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}


static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* write the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* write the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* write the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int i;

    /* write the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = block[0];
        pixels[1] = block[1];
        pixels[2] = block[2];
        pixels[3] = block[3];
        pixels[4] = block[4];
        pixels[5] = block[5];
        pixels[6] = block[6];
        pixels[7] = block[7];

        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* add the block to the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* add the block to the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* add the block to the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}

static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}

static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int i, j;
    uint16_t *dst1 = (uint16_t *) dst;
    uint16_t *dst2 = (uint16_t *)(dst + linesize);

    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            dst1[i] = dst2[i] = src[i] * 0x0101;
        }
        src  += 8;
        dst1 += linesize;
        dst2 += linesize;
    }
}

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= AV_RN64(pixels  );\
    const uint64_t b= AV_RN64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= AV_RN64(pixels  );\
        uint64_t b= AV_RN64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= AV_RN64(pixels  );\
        b= AV_RN64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= AV_RN64(pixels  );\
    const uint64_t b= AV_RN64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= AV_RN64(pixels  );\
        uint64_t b= AV_RN64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= AV_RN64(pixels  );\
        b= AV_RN64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 32 bit variant; the disabled #if 0 branch above is the 64 bit one

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                 int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                                         int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                                                int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                                          int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                                                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i, a0, b0, a1, b1;\
    a0= pixels[0];\
    b0= pixels[1] + 2;\
    a0 += b0;\
    b0 += pixels[2];\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        a1= pixels[0];\
        b1= pixels[1];\
        a1 += b1;\
        b1 += pixels[2];\
\
        block[0]= (a1+a0)>>2; /* FIXME non put */\
        block[1]= (b1+b0)>>2;\
\
        pixels+=line_size;\
        block +=line_size;\
\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        block[0]= (a1+a0)>>2;\
        block[1]= (b1+b0)>>2;\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint32_t a= AV_RN32(pixels  );\
    const uint32_t b= AV_RN32(pixels+1);\
    uint32_t l0=  (a&0x03030303UL)\
                + (b&0x03030303UL)\
                + 0x02020202UL;\
    uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
               + ((b&0xFCFCFCFCUL)>>2);\
    uint32_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint32_t a= AV_RN32(pixels  );\
        uint32_t b= AV_RN32(pixels+1);\
        l1=  (a&0x03030303UL)\
           + (b&0x03030303UL);\
        h1= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
        a= AV_RN32(pixels  );\
        b= AV_RN32(pixels+1);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

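/* Note (illustrative, added for clarity): the packed averages used by
 * PIXOP2 rely on the identities a+b = 2*(a|b) - (a^b) = 2*(a&b) + (a^b),
 * applied per byte:
 *   rnd_avg32:    (a|b) - (((a^b)&0xFEFEFEFE)>>1)  == (a+b+1)>>1 per byte
 *   no_rnd_avg32: (a&b) + (((a^b)&0xFEFEFEFE)>>1)  == (a+b)>>1   per byte
 * The 0xFE mask clears the bits that would otherwise be shifted across
 * byte boundaries.
 */
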
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
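
/* Note (illustrative, added for clarity): gmc1_c is bilinear interpolation
 * at 1/16-pel precision. The four weights always sum to A+B+C+D = 16*16 =
 * 256, so adding the rounder and shifting right by 8 renormalizes; e.g.
 * x16 = y16 = 8 (the half-pel position) gives A = B = C = D = 64.
 */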

void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
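
/* Note (illustrative, added for clarity): these third-pel filters replace a
 * division by 3 with a multiply and shift: 683 = (2^11 + 1)/3, so
 * (683*(2*a + b + 1)) >> 11 equals (2*a + b + 1)/3 for 8-bit inputs. The
 * diagonal cases below use 2731 ~= 2^15/12 the same way, with filter weights
 * summing to 12.
 */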

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put

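/* Note (illustrative, added for clarity): in the chroma MC above the
 * bilinear weights always sum to A+B+C+D = 8*8 = 64, so op_put rounds with
 * +32 and shifts right by 6, while op_avg further averages with the
 * existing destination sample. The VC-1 variants below use the same
 * weights but bias by 32-4, matching their no_rnd (no-rounding) naming.
 */
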
1739 static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1740 const int A=(8-x)*(8-y);
1741 const int B=( x)*(8-y);
1742 const int C=(8-x)*( y);
1743 const int D=( x)*( y);
1744 int i;
1745
1746 assert(x<8 && y<8 && x>=0 && y>=0);
1747
1748 for(i=0; i<h; i++)
1749 {
1750 dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
1751 dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
1752 dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
1753 dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
1754 dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
1755 dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
1756 dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
1757 dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
1758 dst+= stride;
1759 src+= stride;
1760 }
1761 }
1762
1763 static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1764 const int A=(8-x)*(8-y);
1765 const int B=( x)*(8-y);
1766 const int C=(8-x)*( y);
1767 const int D=( x)*( y);
1768 int i;
1769
1770 assert(x<8 && y<8 && x>=0 && y>=0);
1771
1772 for(i=0; i<h; i++)
1773 {
1774 dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
1775 dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
1776 dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
1777 dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
1778 dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
1779 dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
1780 dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
1781 dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
1782 dst+= stride;
1783 src+= stride;
1784 }
1785 }
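/* The VC-1 "no rounding" chroma variants bias the same bilinear sum with
 * 32 - 4 = 28 instead of 32 before the >> 6, i.e. they round slightly
 * downwards as the codec's no-rounding mode requires; avg2() is assumed to
 * be the (a + b + 1) >> 1 pixel average defined earlier in this file. A
 * one-pixel sketch of the put case (hypothetical helper name):
 */
#if 0
static uint8_t vc1_chroma_1px(const uint8_t *src, int stride, int x, int y)
{
    const int A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y;
    return (A*src[0] + B*src[1] + C*src[stride] + D*src[stride+1] + 28) >> 6;
}
#endif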
1786
1787 #define QPEL_MC(r, OPNAME, RND, OP) \
1788 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1789 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1790 int i;\
1791 for(i=0; i<h; i++)\
1792 {\
1793 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1794 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1795 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1796 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1797 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1798 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1799 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1800 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1801 dst+=dstStride;\
1802 src+=srcStride;\
1803 }\
1804 }\
1805 \
1806 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1807 const int w=8;\
1808 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1809 int i;\
1810 for(i=0; i<w; i++)\
1811 {\
1812 const int src0= src[0*srcStride];\
1813 const int src1= src[1*srcStride];\
1814 const int src2= src[2*srcStride];\
1815 const int src3= src[3*srcStride];\
1816 const int src4= src[4*srcStride];\
1817 const int src5= src[5*srcStride];\
1818 const int src6= src[6*srcStride];\
1819 const int src7= src[7*srcStride];\
1820 const int src8= src[8*srcStride];\
1821 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1822 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1823 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1824 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1825 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1826 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1827 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1828 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1829 dst++;\
1830 src++;\
1831 }\
1832 }\
1833 \
1834 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1835 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1836 int i;\
1837 \
1838 for(i=0; i<h; i++)\
1839 {\
1840 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1841 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1842 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1843 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1844 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1845 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1846 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1847 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1848 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1849 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1850 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1851 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1852 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1853 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1854 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1855 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1856 dst+=dstStride;\
1857 src+=srcStride;\
1858 }\
1859 }\
1860 \
1861 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1862 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1863 int i;\
1864 const int w=16;\
1865 for(i=0; i<w; i++)\
1866 {\
1867 const int src0= src[0*srcStride];\
1868 const int src1= src[1*srcStride];\
1869 const int src2= src[2*srcStride];\
1870 const int src3= src[3*srcStride];\
1871 const int src4= src[4*srcStride];\
1872 const int src5= src[5*srcStride];\
1873 const int src6= src[6*srcStride];\
1874 const int src7= src[7*srcStride];\
1875 const int src8= src[8*srcStride];\
1876 const int src9= src[9*srcStride];\
1877 const int src10= src[10*srcStride];\
1878 const int src11= src[11*srcStride];\
1879 const int src12= src[12*srcStride];\
1880 const int src13= src[13*srcStride];\
1881 const int src14= src[14*srcStride];\
1882 const int src15= src[15*srcStride];\
1883 const int src16= src[16*srcStride];\
1884 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1885 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1886 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1887 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1888 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1889 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1890 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1891 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1892 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1893 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1894 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1895 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1896 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1897 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1898 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1899 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1900 dst++;\
1901 src++;\
1902 }\
1903 }\
1904 \
1905 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1906 OPNAME ## pixels8_c(dst, src, stride, 8);\
1907 }\
1908 \
1909 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1910 uint8_t half[64];\
1911 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1912 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1913 }\
1914 \
1915 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1916 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1917 }\
1918 \
1919 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1920 uint8_t half[64];\
1921 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1922 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1923 }\
1924 \
1925 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1926 uint8_t full[16*9];\
1927 uint8_t half[64];\
1928 copy_block9(full, src, 16, stride, 9);\
1929 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1930 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1931 }\
1932 \
1933 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1934 uint8_t full[16*9];\
1935 copy_block9(full, src, 16, stride, 9);\
1936 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1937 }\
1938 \
1939 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1940 uint8_t full[16*9];\
1941 uint8_t half[64];\
1942 copy_block9(full, src, 16, stride, 9);\
1943 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1944 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1945 }\
1946 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1947 uint8_t full[16*9];\
1948 uint8_t halfH[72];\
1949 uint8_t halfV[64];\
1950 uint8_t halfHV[64];\
1951 copy_block9(full, src, 16, stride, 9);\
1952 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1953 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1954 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1955 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1956 }\
1957 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1958 uint8_t full[16*9];\
1959 uint8_t halfH[72];\
1960 uint8_t halfHV[64];\
1961 copy_block9(full, src, 16, stride, 9);\
1962 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1963 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1964 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1965 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1966 }\
1967 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1968 uint8_t full[16*9];\
1969 uint8_t halfH[72];\
1970 uint8_t halfV[64];\
1971 uint8_t halfHV[64];\
1972 copy_block9(full, src, 16, stride, 9);\
1973 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1974 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1975 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1976 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1977 }\
1978 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1979 uint8_t full[16*9];\
1980 uint8_t halfH[72];\
1981 uint8_t halfHV[64];\
1982 copy_block9(full, src, 16, stride, 9);\
1983 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1984 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1985 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1986 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1987 }\
1988 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1989 uint8_t full[16*9];\
1990 uint8_t halfH[72];\
1991 uint8_t halfV[64];\
1992 uint8_t halfHV[64];\
1993 copy_block9(full, src, 16, stride, 9);\
1994 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1995 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1996 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1997 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1998 }\
1999 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2000 uint8_t full[16*9];\
2001 uint8_t halfH[72];\
2002 uint8_t halfHV[64];\
2003 copy_block9(full, src, 16, stride, 9);\
2004 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2005 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
2006 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2007 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
2008 }\
2009 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2010 uint8_t full[16*9];\
2011 uint8_t halfH[72];\
2012 uint8_t halfV[64];\
2013 uint8_t halfHV[64];\
2014 copy_block9(full, src, 16, stride, 9);\
2015 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
2016 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
2017 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2018 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
2019 }\
2020 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2021 uint8_t full[16*9];\
2022 uint8_t halfH[72];\
2023 uint8_t halfHV[64];\
2024 copy_block9(full, src, 16, stride, 9);\
2025 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2026 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
2027 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2028 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
2029 }\
2030 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2031 uint8_t halfH[72];\
2032 uint8_t halfHV[64];\
2033 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2034 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2035 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
2036 }\
2037 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2038 uint8_t halfH[72];\
2039 uint8_t halfHV[64];\
2040 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2041 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2042 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
2043 }\
2044 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2045 uint8_t full[16*9];\
2046 uint8_t halfH[72];\
2047 uint8_t halfV[64];\
2048 uint8_t halfHV[64];\
2049 copy_block9(full, src, 16, stride, 9);\
2050 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2051 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
2052 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2053 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2054 }\
2055 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2056 uint8_t full[16*9];\
2057 uint8_t halfH[72];\
2058 copy_block9(full, src, 16, stride, 9);\
2059 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2060 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
2061 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2062 }\
2063 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2064 uint8_t full[16*9];\
2065 uint8_t halfH[72];\
2066 uint8_t halfV[64];\
2067 uint8_t halfHV[64];\
2068 copy_block9(full, src, 16, stride, 9);\
2069 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2070 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
2071 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2072 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2073 }\
2074 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2075 uint8_t full[16*9];\
2076 uint8_t halfH[72];\
2077 copy_block9(full, src, 16, stride, 9);\
2078 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2079 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
2080 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2081 }\
2082 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2083 uint8_t halfH[72];\
2084 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2085 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2086 }\
2087 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2088 OPNAME ## pixels16_c(dst, src, stride, 16);\
2089 }\
2090 \
2091 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2092 uint8_t half[256];\
2093 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2094 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2095 }\
2096 \
2097 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2098 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2099 }\
2100 \
2101 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2102 uint8_t half[256];\
2103 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2104 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2105 }\
2106 \
2107 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2108 uint8_t full[24*17];\
2109 uint8_t half[256];\
2110 copy_block17(full, src, 24, stride, 17);\
2111 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2112 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2113 }\
2114 \
2115 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2116 uint8_t full[24*17];\
2117 copy_block17(full, src, 24, stride, 17);\
2118 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2119 }\
2120 \
2121 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2122 uint8_t full[24*17];\
2123 uint8_t half[256];\
2124 copy_block17(full, src, 24, stride, 17);\
2125 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2126 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2127 }\
2128 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2129 uint8_t full[24*17];\
2130 uint8_t halfH[272];\
2131 uint8_t halfV[256];\
2132 uint8_t halfHV[256];\
2133 copy_block17(full, src, 24, stride, 17);\
2134 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2135 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2136 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2137 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2138 }\
2139 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2140 uint8_t full[24*17];\
2141 uint8_t halfH[272];\
2142 uint8_t halfHV[256];\
2143 copy_block17(full, src, 24, stride, 17);\
2144 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2145 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2146 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2147 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2148 }\
2149 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2150 uint8_t full[24*17];\
2151 uint8_t halfH[272];\
2152 uint8_t halfV[256];\
2153 uint8_t halfHV[256];\
2154 copy_block17(full, src, 24, stride, 17);\
2155 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2156 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2157 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2158 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2159 }\
2160 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2161 uint8_t full[24*17];\
2162 uint8_t halfH[272];\
2163 uint8_t halfHV[256];\
2164 copy_block17(full, src, 24, stride, 17);\
2165 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2166 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2167 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2168 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2169 }\
2170 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2171 uint8_t full[24*17];\
2172 uint8_t halfH[272];\
2173 uint8_t halfV[256];\
2174 uint8_t halfHV[256];\
2175 copy_block17(full, src, 24, stride, 17);\
2176 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2177 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2178 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2179 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2180 }\
2181 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2182 uint8_t full[24*17];\
2183 uint8_t halfH[272];\
2184 uint8_t halfHV[256];\
2185 copy_block17(full, src, 24, stride, 17);\
2186 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2187 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2188 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2189 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2190 }\
2191 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2192 uint8_t full[24*17];\
2193 uint8_t halfH[272];\
2194 uint8_t halfV[256];\
2195 uint8_t halfHV[256];\
2196 copy_block17(full, src, 24, stride, 17);\
2197 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2198 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2199 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2200 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2201 }\
2202 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2203 uint8_t full[24*17];\
2204 uint8_t halfH[272];\
2205 uint8_t halfHV[256];\
2206 copy_block17(full, src, 24, stride, 17);\
2207 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2208 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2209 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2210 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2211 }\
2212 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2213 uint8_t halfH[272];\
2214 uint8_t halfHV[256];\
2215 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2216 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2217 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2218 }\
2219 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2220 uint8_t halfH[272];\
2221 uint8_t halfHV[256];\
2222 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2223 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2224 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2225 }\
2226 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2227 uint8_t full[24*17];\
2228 uint8_t halfH[272];\
2229 uint8_t halfV[256];\
2230 uint8_t halfHV[256];\
2231 copy_block17(full, src, 24, stride, 17);\
2232 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2233 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2234 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2235 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2236 }\
2237 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2238 uint8_t full[24*17];\
2239 uint8_t halfH[272];\
2240 copy_block17(full, src, 24, stride, 17);\
2241 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2242 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2243 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2244 }\
2245 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2246 uint8_t full[24*17];\
2247 uint8_t halfH[272];\
2248 uint8_t halfV[256];\
2249 uint8_t halfHV[256];\
2250 copy_block17(full, src, 24, stride, 17);\
2251 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2252 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2253 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2254 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2255 }\
2256 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2257 uint8_t full[24*17];\
2258 uint8_t halfH[272];\
2259 copy_block17(full, src, 24, stride, 17);\
2260 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2261 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2262 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2263 }\
2264 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2265 uint8_t halfH[272];\
2266 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2267 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2268 }
2269
2270 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2271 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2272 #define op_put(a, b) a = cm[((b) + 16)>>5]
2273 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2274
2275 QPEL_MC(0, put_ , _ , op_put)
2276 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2277 QPEL_MC(0, avg_ , _ , op_avg)
2278 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2279 #undef op_avg
2280 #undef op_avg_no_rnd
2281 #undef op_put
2282 #undef op_put_no_rnd
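/* QPEL_MC builds the MPEG-4 quarter-pel functions around an 8-tap lowpass
 * with taps {-1, 3, -6, 20, 20, -6, 3, -1}: the tap sum is 32, matching the
 * (b + 16) >> 5 in op_put and the (b + 15) >> 5 in the no_rnd variants. At
 * the right/bottom block edge the outer taps are mirrored back inside the
 * block (note the reuse of src[8] resp. src[16] above) rather than reading
 * past it. Quarter-pel positions are then synthesised by blending half-pel
 * planes, e.g. mc10 = average of src and the h-filtered plane. A sanity
 * sketch of the normalisation (illustrative only):
 */
#if 0
static int mpeg4_qpel_gain(void)
{
    static const int taps[8] = { -1, 3, -6, 20, 20, -6, 3, -1 };
    int i, sum = 0;
    for (i = 0; i < 8; i++)
        sum += taps[i];
    return sum; /* == 32, hence the >> 5 */
}
#endif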
2283
2284 #if 1 /* development toggle, always enabled: builds the C reference versions below */
2285 #define H264_LOWPASS(OPNAME, OP, OP2) \
2286 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2287 const int h=2;\
2288 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2289 int i;\
2290 for(i=0; i<h; i++)\
2291 {\
2292 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2293 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2294 dst+=dstStride;\
2295 src+=srcStride;\
2296 }\
2297 }\
2298 \
2299 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2300 const int w=2;\
2301 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2302 int i;\
2303 for(i=0; i<w; i++)\
2304 {\
2305 const int srcB= src[-2*srcStride];\
2306 const int srcA= src[-1*srcStride];\
2307 const int src0= src[0 *srcStride];\
2308 const int src1= src[1 *srcStride];\
2309 const int src2= src[2 *srcStride];\
2310 const int src3= src[3 *srcStride];\
2311 const int src4= src[4 *srcStride];\
2312 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2313 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2314 dst++;\
2315 src++;\
2316 }\
2317 }\
2318 \
2319 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2320 const int h=2;\
2321 const int w=2;\
2322 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2323 int i;\
2324 src -= 2*srcStride;\
2325 for(i=0; i<h+5; i++)\
2326 {\
2327 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2328 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2329 tmp+=tmpStride;\
2330 src+=srcStride;\
2331 }\
2332 tmp -= tmpStride*(h+5-2);\
2333 for(i=0; i<w; i++)\
2334 {\
2335 const int tmpB= tmp[-2*tmpStride];\
2336 const int tmpA= tmp[-1*tmpStride];\
2337 const int tmp0= tmp[0 *tmpStride];\
2338 const int tmp1= tmp[1 *tmpStride];\
2339 const int tmp2= tmp[2 *tmpStride];\
2340 const int tmp3= tmp[3 *tmpStride];\
2341 const int tmp4= tmp[4 *tmpStride];\
2342 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2343 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2344 dst++;\
2345 tmp++;\
2346 }\
2347 }\
2348 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2349 const int h=4;\
2350 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2351 int i;\
2352 for(i=0; i<h; i++)\
2353 {\
2354 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2355 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2356 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2357 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2358 dst+=dstStride;\
2359 src+=srcStride;\
2360 }\
2361 }\
2362 \
2363 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2364 const int w=4;\
2365 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2366 int i;\
2367 for(i=0; i<w; i++)\
2368 {\
2369 const int srcB= src[-2*srcStride];\
2370 const int srcA= src[-1*srcStride];\
2371 const int src0= src[0 *srcStride];\
2372 const int src1= src[1 *srcStride];\
2373 const int src2= src[2 *srcStride];\
2374 const int src3= src[3 *srcStride];\
2375 const int src4= src[4 *srcStride];\
2376 const int src5= src[5 *srcStride];\
2377 const int src6= src[6 *srcStride];\
2378 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2379 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2380 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2381 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2382 dst++;\
2383 src++;\
2384 }\
2385 }\
2386 \
2387 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2388 const int h=4;\
2389 const int w=4;\
2390 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2391 int i;\
2392 src -= 2*srcStride;\
2393 for(i=0; i<h+5; i++)\
2394 {\
2395 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2396 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2397 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2398 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2399 tmp+=tmpStride;\
2400 src+=srcStride;\
2401 }\
2402 tmp -= tmpStride*(h+5-2);\
2403 for(i=0; i<w; i++)\
2404 {\
2405 const int tmpB= tmp[-2*tmpStride];\
2406 const int tmpA= tmp[-1*tmpStride];\
2407 const int tmp0= tmp[0 *tmpStride];\
2408 const int tmp1= tmp[1 *tmpStride];\
2409 const int tmp2= tmp[2 *tmpStride];\
2410 const int tmp3= tmp[3 *tmpStride];\
2411 const int tmp4= tmp[4 *tmpStride];\
2412 const int tmp5= tmp[5 *tmpStride];\
2413 const int tmp6= tmp[6 *tmpStride];\
2414 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2415 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2416 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2417 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2418 dst++;\
2419 tmp++;\
2420 }\
2421 }\
2422 \
2423 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2424 const int h=8;\
2425 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2426 int i;\
2427 for(i=0; i<h; i++)\
2428 {\
2429 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2430 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2431 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2432 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2433 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2434 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2435 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2436 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2437 dst+=dstStride;\
2438 src+=srcStride;\
2439 }\
2440 }\
2441 \
2442 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2443 const int w=8;\
2444 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2445 int i;\
2446 for(i=0; i<w; i++)\
2447 {\
2448 const int srcB= src[-2*srcStride];\
2449 const int srcA= src[-1*srcStride];\
2450 const int src0= src[0 *srcStride];\
2451 const int src1= src[1 *srcStride];\
2452 const int src2= src[2 *srcStride];\
2453 const int src3= src[3 *srcStride];\
2454 const int src4= src[4 *srcStride];\
2455 const int src5= src[5 *srcStride];\
2456 const int src6= src[6 *srcStride];\
2457 const int src7= src[7 *srcStride];\
2458 const int src8= src[8 *srcStride];\
2459 const int src9= src[9 *srcStride];\
2460 const int src10=src[10*srcStride];\
2461 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2462 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2463 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2464 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2465 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2466 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2467 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2468 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2469 dst++;\
2470 src++;\
2471 }\
2472 }\
2473 \
2474 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2475 const int h=8;\
2476 const int w=8;\
2477 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2478 int i;\
2479 src -= 2*srcStride;\
2480 for(i=0; i<h+5; i++)\
2481 {\
2482 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2483 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2484 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2485 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2486 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2487 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2488 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2489 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2490 tmp+=tmpStride;\
2491 src+=srcStride;\
2492 }\
2493 tmp -= tmpStride*(h+5-2);\
2494 for(i=0; i<w; i++)\
2495 {\
2496 const int tmpB= tmp[-2*tmpStride];\
2497 const int tmpA= tmp[-1*tmpStride];\
2498 const int tmp0= tmp[0 *tmpStride];\
2499 const int tmp1= tmp[1 *tmpStride];\
2500 const int tmp2= tmp[2 *tmpStride];\
2501 const int tmp3= tmp[3 *tmpStride];\
2502 const int tmp4= tmp[4 *tmpStride];\
2503 const int tmp5= tmp[5 *tmpStride];\
2504 const int tmp6= tmp[6 *tmpStride];\
2505 const int tmp7= tmp[7 *tmpStride];\
2506 const int tmp8= tmp[8 *tmpStride];\
2507 const int tmp9= tmp[9 *tmpStride];\
2508 const int tmp10=tmp[10*tmpStride];\
2509 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2510 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2511 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2512 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2513 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2514 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2515 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2516 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2517 dst++;\
2518 tmp++;\
2519 }\
2520 }\
2521 \
2522 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2523 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2524 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2525 src += 8*srcStride;\
2526 dst += 8*dstStride;\
2527 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2528 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2529 }\
2530 \
2531 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2532 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2533 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2534 src += 8*srcStride;\
2535 dst += 8*dstStride;\
2536 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2537 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2538 }\
2539 \
2540 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2541 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2542 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2543 src += 8*srcStride;\
2544 dst += 8*dstStride;\
2545 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2546 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2547 }\
2548
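/* H264_LOWPASS is the H.264 six-tap half-pel filter, taps {1, -5, 20, 20,
 * -5, 1} (gain 32). The plain h/v kernels clip once through cm[(b+16)>>5];
 * the hv kernel keeps the unclipped horizontal output as int16_t in tmp[]
 * and normalises the doubly filtered result in a single (b + 512) >> 10,
 * since the combined gain is 32*32 = 1024. A one-sample sketch of the
 * horizontal tap (illustrative helper, not library API):
 */
#if 0
static int h264_sixtap_1px(const uint8_t *s) /* s[-2..3] must be readable */
{
    return (s[0]+s[1])*20 - (s[-1]+s[2])*5 + (s[-2]+s[3]);
}
#endif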
2549 #define H264_MC(OPNAME, SIZE) \
2550 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2551 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2552 }\
2553 \
2554 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2555 uint8_t half[SIZE*SIZE];\
2556 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2557 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2558 }\
2559 \
2560 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2561 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2562 }\
2563 \
2564 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2565 uint8_t half[SIZE*SIZE];\
2566 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2567 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2568 }\
2569 \
2570 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2571 uint8_t full[SIZE*(SIZE+5)];\
2572 uint8_t * const full_mid= full + SIZE*2;\
2573 uint8_t half[SIZE*SIZE];\
2574 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2575 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2576 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2577 }\
2578 \
2579 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2580 uint8_t full[SIZE*(SIZE+5)];\
2581 uint8_t * const full_mid= full + SIZE*2;\
2582 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2583 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2584 }\
2585 \
2586 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2587 uint8_t full[SIZE*(SIZE+5)];\
2588 uint8_t * const full_mid= full + SIZE*2;\
2589 uint8_t half[SIZE*SIZE];\
2590 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2591 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2592 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2593 }\
2594 \
2595 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2596 uint8_t full[SIZE*(SIZE+5)];\
2597 uint8_t * const full_mid= full + SIZE*2;\
2598 uint8_t halfH[SIZE*SIZE];\
2599 uint8_t halfV[SIZE*SIZE];\
2600 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2601 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2602 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2603 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2604 }\
2605 \
2606 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2607 uint8_t full[SIZE*(SIZE+5)];\
2608 uint8_t * const full_mid= full + SIZE*2;\
2609 uint8_t halfH[SIZE*SIZE];\
2610 uint8_t halfV[SIZE*SIZE];\
2611 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2612 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2613 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2614 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2615 }\
2616 \
2617 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2618 uint8_t full[SIZE*(SIZE+5)];\
2619 uint8_t * const full_mid= full + SIZE*2;\
2620 uint8_t halfH[SIZE*SIZE];\
2621 uint8_t halfV[SIZE*SIZE];\
2622 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2623 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2624 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2625 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2626 }\
2627 \
2628 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2629 uint8_t full[SIZE*(SIZE+5)];\
2630 uint8_t * const full_mid= full + SIZE*2;\
2631 uint8_t halfH[SIZE*SIZE];\
2632 uint8_t halfV[SIZE*SIZE];\
2633 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2634 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2635 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2636 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2637 }\
2638 \
2639 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2640 int16_t tmp[SIZE*(SIZE+5)];\
2641 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2642 }\
2643 \
2644 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2645 int16_t tmp[SIZE*(SIZE+5)];\
2646 uint8_t halfH[SIZE*SIZE];\
2647 uint8_t halfHV[SIZE*SIZE];\
2648 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2649 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2650 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2651 }\
2652 \
2653 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2654 int16_t tmp[SIZE*(SIZE+5)];\
2655 uint8_t halfH[SIZE*SIZE];\
2656 uint8_t halfHV[SIZE*SIZE];\
2657 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2658 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2659 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2660 }\
2661 \
2662 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2663 uint8_t full[SIZE*(SIZE+5)];\
2664 uint8_t * const full_mid= full + SIZE*2;\
2665 int16_t tmp[SIZE*(SIZE+5)];\
2666 uint8_t halfV[SIZE*SIZE];\
2667 uint8_t halfHV[SIZE*SIZE];\
2668 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2669 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2670 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2671 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2672 }\
2673 \
2674 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2675 uint8_t full[SIZE*(SIZE+5)];\
2676 uint8_t * const full_mid= full + SIZE*2;\
2677 int16_t tmp[SIZE*(SIZE+5)];\
2678 uint8_t halfV[SIZE*SIZE];\
2679 uint8_t halfHV[SIZE*SIZE];\
2680 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2681 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2682 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2683 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2684 }\
2685
2686 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2687 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2688 #define op_put(a, b) a = cm[((b) + 16)>>5]
2689 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2690 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2691
2692 H264_LOWPASS(put_ , op_put, op2_put)
2693 H264_LOWPASS(avg_ , op_avg, op2_avg)
2694 H264_MC(put_, 2)
2695 H264_MC(put_, 4)
2696 H264_MC(put_, 8)
2697 H264_MC(put_, 16)
2698 H264_MC(avg_, 4)
2699 H264_MC(avg_, 8)
2700 H264_MC(avg_, 16)
2701
2702 #undef op_avg
2703 #undef op_put
2704 #undef op2_avg
2705 #undef op2_put
2706 #endif
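/* H264_MC expands one function per quarter-pel position: the _mcXY suffix
 * encodes the fractional offset (X, Y) in quarter pixels, so mc00 is a plain
 * copy, mc20/mc02 are pure half-pel filters, mc22 is the 2-D hv filter, and
 * the remaining positions average two intermediate planes (full-, half- or
 * hv-filtered) with pixelsN_l2. A hypothetical call through the DSPContext
 * table, assuming the [y*4 + x] indexing installed by dsputil_init():
 */
#if 0
static void example_h264_qpel(DSPContext *c, uint8_t *dst, uint8_t *src,
                              int stride)
{
    /* 16x16 luma block at quarter-pel offset x=1, y=2 -> entry 2*4 + 1 */
    c->put_h264_qpel_pixels_tab[0][2*4 + 1](dst, src, stride);
}
#endif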
2707
2708 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2709 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2710 #define H264_WEIGHT(W,H) \
2711 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2712 int y; \
2713 offset <<= log2_denom; \
2714 if(log2_denom) offset += 1<<(log2_denom-1); \
2715 for(y=0; y<H; y++, block += stride){ \
2716 op_scale1(0); \
2717 op_scale1(1); \
2718 if(W==2) continue; \
2719 op_scale1(2); \
2720 op_scale1(3); \
2721 if(W==4) continue; \
2722 op_scale1(4); \
2723 op_scale1(5); \
2724 op_scale1(6); \
2725 op_scale1(7); \
2726 if(W==8) continue; \
2727 op_scale1(8); \
2728 op_scale1(9); \
2729 op_scale1(10); \
2730 op_scale1(11); \
2731 op_scale1(12); \
2732 op_scale1(13); \
2733 op_scale1(14); \
2734 op_scale1(15); \
2735 } \
2736 } \
2737 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2738 int y; \
2739 offset = ((offset + 1) | 1) << log2_denom; \
2740 for(y=0; y<H; y++, dst += stride, src += stride){ \
2741 op_scale2(0); \
2742 op_scale2(1); \
2743 if(W==2) continue; \
2744 op_scale2(2); \
2745 op_scale2(3); \
2746 if(W==4) continue; \
2747 op_scale2(4); \
2748 op_scale2(5); \
2749 op_scale2(6); \
2750 op_scale2(7); \
2751 if(W==8) continue; \
2752 op_scale2(8); \
2753 op_scale2(9); \
2754 op_scale2(10); \
2755 op_scale2(11); \
2756 op_scale2(12); \
2757 op_scale2(13); \
2758 op_scale2(14); \
2759 op_scale2(15); \
2760 } \
2761 }
2762
2763 H264_WEIGHT(16,16)
2764 H264_WEIGHT(16,8)
2765 H264_WEIGHT(8,16)
2766 H264_WEIGHT(8,8)
2767 H264_WEIGHT(8,4)
2768 H264_WEIGHT(4,8)
2769 H264_WEIGHT(4,4)
2770 H264_WEIGHT(4,2)
2771 H264_WEIGHT(2,4)
2772 H264_WEIGHT(2,2)
2773
2774 #undef op_scale1
2775 #undef op_scale2
2776 #undef H264_WEIGHT
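/* The weight/biweight kernels implement H.264 explicit weighted prediction.
 * op_scale1 computes clip((sample*weight + rounding + (offset<<denom)) >>
 * log2_denom); op_scale2 blends two predictions with independent weights and
 * a shared offset, dividing by 1 << (log2_denom+1). A scalar sketch of the
 * unidirectional case (hypothetical helper, mirrors op_scale1 above):
 */
#if 0
static uint8_t weight_1px(uint8_t s, int weight, int offset, int log2_denom)
{
    int o = offset << log2_denom;
    if (log2_denom)
        o += 1 << (log2_denom - 1);      /* round to nearest */
    return av_clip_uint8((s * weight + o) >> log2_denom);
}
#endif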
2777
2778 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2779 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2780 int i;
2781
2782 for(i=0; i<h; i++){
2783 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2784 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2785 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2786 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2787 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2788 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2789 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2790 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2791 dst+=dstStride;
2792 src+=srcStride;
2793 }
2794 }
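/* wmv2_mspel8_{h,v}_lowpass are the WMV2 half-pel filters: a 4-tap
 * {-1, 9, 9, -1} kernel with gain 16, rounded with +8 before the >> 4 and
 * clipped through the crop table. One-sample sketch (illustrative only):
 */
#if 0
static int wmv2_tap_1px(const uint8_t *s) /* s[-1..2] must be readable */
{
    return (9*(s[0] + s[1]) - (s[-1] + s[2]) + 8) >> 4;
}
#endif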
2795
2796 #if CONFIG_CAVS_DECODER
2797 /* AVS specific */
2798 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2799 put_pixels8_c(dst, src, stride, 8);
2800 }
2801 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2802 avg_pixels8_c(dst, src, stride, 8);
2803 }
2804 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2805 put_pixels16_c(dst, src, stride, 16);
2806 }
2807 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2808 avg_pixels16_c(dst, src, stride, 16);
2809 }
2810 #endif /* CONFIG_CAVS_DECODER */
2811
2812 #if CONFIG_VC1_DECODER
2813 /* VC-1 specific */
2814 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2815 put_pixels8_c(dst, src, stride, 8);
2816 }
2817 void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2818 avg_pixels8_c(dst, src, stride, 8);
2819 }
2820 #endif /* CONFIG_VC1_DECODER */
2821
2822 /* H264 specific */
2823 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2824
2825 #if CONFIG_RV40_DECODER
2826 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2827 put_pixels16_xy2_c(dst, src, stride, 16);
2828 }
2829 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2830 avg_pixels16_xy2_c(dst, src, stride, 16);
2831 }
2832 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2833 put_pixels8_xy2_c(dst, src, stride, 8);
2834 }
2835 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2836 avg_pixels8_xy2_c(dst, src, stride, 8);
2837 }
2838 #endif /* CONFIG_RV40_DECODER */
2839
2840 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2841 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2842 int i;
2843
2844 for(i=0; i<w; i++){
2845 const int src_1= src[ -srcStride];
2846 const int src0 = src[0 ];
2847 const int src1 = src[ srcStride];
2848 const int src2 = src[2*srcStride];
2849 const int src3 = src[3*srcStride];
2850 const int src4 = src[4*srcStride];
2851 const int src5 = src[5*srcStride];
2852 const int src6 = src[6*srcStride];
2853 const int src7 = src[7*srcStride];
2854 const int src8 = src[8*srcStride];
2855 const int src9 = src[9*srcStride];
2856 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2857 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2858 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2859 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2860 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2861 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2862 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2863 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2864 src++;
2865 dst++;
2866 }
2867 }
2868
2869 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2870 put_pixels8_c(dst, src, stride, 8);
2871 }
2872
2873 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2874 uint8_t half[64];
2875 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2876 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2877 }
2878
2879 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2880 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2881 }
2882
2883 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2884 uint8_t half[64];
2885 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2886 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2887 }
2888
2889 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2890 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2891 }
2892
2893 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2894 uint8_t halfH[88];
2895 uint8_t halfV[64];
2896 uint8_t halfHV[64];
2897 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2898 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2899 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2900 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2901 }
2902 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2903 uint8_t halfH[88];
2904 uint8_t halfV[64];
2905 uint8_t halfHV[64];
2906 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2907 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2908 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2909 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2910 }
2911 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2912 uint8_t halfH[88];
2913 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2914 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2915 }
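/* As with the qpel tables, the _mcXY suffix gives the half-pel offset. The
 * combined positions (mc12/mc32/mc22) first run the horizontal filter over
 * 11 rows -- the 8 output rows plus the vertical filter's context of one row
 * above and two below -- into halfH, then filter that plane column-wise from
 * halfH+8 so the vertical kernel's src[-stride] lands on the context row.
 * Hypothetical dispatch through the DSPContext table, assuming the ordering
 * installed by dsputil_init():
 */
#if 0
static void example_wmv2_mspel(DSPContext *c, uint8_t *dst, uint8_t *src,
                               int stride)
{
    c->put_mspel_pixels_tab[2](dst, src, stride); /* horizontal half-pel */
}
#endif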
2916
2917 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2918 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2919 int x;
2920 const int strength= ff_h263_loop_filter_strength[qscale];
2921
2922 for(x=0; x<8; x++){
2923 int d1, d2, ad1;
2924 int p0= src[x-2*stride];
2925 int p1= src[x-1*stride];
2926 int p2= src[x+0*stride];
2927 int p3= src[x+1*stride];
2928 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2929
2930 if (d<-2*strength) d1= 0;
2931 else if(d<- strength) d1=-2*strength - d;
2932 else if(d< strength) d1= d;
2933 else if(d< 2*strength) d1= 2*strength - d;
2934 else d1= 0;
2935
2936 p1 += d1;
2937 p2 -= d1;
2938 if(p1&256) p1= ~(p1>>31);
2939 if(p2&256) p2= ~(p2>>31);
2940
2941 src[x-1*stride] = p1;
2942 src[x+0*stride] = p2;
2943
2944 ad1= FFABS(d1)>>1;
2945
2946 d2= av_clip((p0-p3)/4, -ad1, ad1);
2947
2948 src[x-2*stride] = p0 - d2;
2949 src[x+ stride] = p3 + d2;
2950 }
2951 }
2952 }
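/* The H.263 deblocker derives d from the four pixels straddling the edge and
 * maps it through a ramp (d1) that is strongest for small steps and falls
 * back to zero beyond 2*strength, so genuine edges survive. The line
 * "if(p1&256) p1= ~(p1>>31);" is a branch-light clip to 0..255: after +-d1
 * the value lies in -256..511, where bit 8 is set exactly when it is out of
 * range, and ~(p1>>31) yields 0 for negative values and all-ones (255 once
 * stored into the uint8_t) for overflows. An explicit equivalent:
 */
#if 0
static int clip_uint8_trick(int v) /* valid for v in -256..511 */
{
    if (v & 256)
        v = ~(v >> 31);  /* v < 0 -> ~(-1) = 0;  v > 255 -> ~0 -> 255 */
    return v & 0xff;
}
#endif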
2953
2954 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2955 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2956 int y;
2957 const int strength= ff_h263_loop_filter_strength[qscale];
2958
2959 for(y=0; y<8; y++){
2960 int d1, d2, ad1;
2961 int p0= src[y*stride-2];
2962 int p1= src[y*stride-1];
2963 int p2= src[y*stride+0];
2964 int p3= src[y*stride+1];
2965 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2966
2967 if (d<-2*strength) d1= 0;
2968 else if(d<- strength) d1=-2*strength - d;
2969 else if(d< strength) d1= d;
2970 else if(d< 2*strength) d1= 2*strength - d;
2971 else d1= 0;
2972
2973 p1 += d1;
2974 p2 -= d1;
2975 if(p1&256) p1= ~(p1>>31);
2976 if(p2&256) p2= ~(p2>>31);
2977
2978 src[y*stride-1] = p1;
2979 src[y*stride+0] = p2;
2980
2981 ad1= FFABS(d1)>>1;
2982
2983 d2= av_clip((p0-p3)/4, -ad1, ad1);
2984
2985 src[y*stride-2] = p0 - d2;
2986 src[y*stride+1] = p3 + d2;
2987 }
2988 }
2989 }
2990
2991 static void h261_loop_filter_c(uint8_t *src, int stride){
2992 int x,y,xy,yz;
2993 int temp[64];
2994
2995 for(x=0; x<8; x++){
2996 temp[x ] = 4*src[x ];
2997 temp[x + 7*8] = 4*src[x + 7*stride];
2998 }
2999 for(y=1; y<7; y++){
3000 for(x=0; x<8; x++){
3001 xy = y * stride + x;
3002 yz = y * 8 + x;
3003 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
3004 }
3005 }
3006
3007 for(y=0; y<8; y++){
3008 src[ y*stride] = (temp[ y*8] + 2)>>2;
3009 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
3010 for(x=1; x<7; x++){
3011 xy = y * stride + x;
3012 yz = y * 8 + x;
3013 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
3014 }
3015 }
3016 }
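/* h261_loop_filter_c is the H.261 in-loop smoother: a separable [1 2 1]
 * kernel applied vertically into temp[] and then horizontally, normalising
 * interior samples with (x + 8) >> 4 (combined gain 16). The top and bottom
 * rows pass the vertical stage unfiltered (pre-scaled by 4) and the first
 * and last columns skip the horizontal stage, so block-edge pixels are
 * smoothed in at most one direction and the corners not at all. The interior
 * case written out as the full 2-D kernel (illustrative only):
 */
#if 0
static int h261_smooth_1px(const uint8_t *s, int stride)
{
    return (   s[-stride-1] + 2*s[-stride] +   s[-stride+1]
            + 2*s[       -1] + 4*s[      0] + 2*s[        1]
            +   s[ stride-1] + 2*s[ stride] +   s[ stride+1] + 8) >> 4;
}
#endif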
3017
3018 static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
3019 {
3020 int i, d;
3021 for( i = 0; i < 4; i++ ) {
3022 if( tc0[i] < 0 ) {
3023 pix += 4*ystride;
3024 continue;
3025 }
3026 for( d = 0; d < 4; d++ ) {
3027 const int p0 = pix[-1*xstride];
3028 const int p1 = pix[-2*xstride];
3029 const int p2 = pix[-3*xstride];
3030 const int q0 = pix[0];
3031 const int q1 = pix[1*xstride];
3032 const int q2 = pix[2*xstride];
3033
3034 if( FFABS( p0 - q0 ) < alpha &&
3035 FFABS( p1 - p0 ) < beta &&
3036 FFABS( q1 - q0 ) < beta ) {
3037
3038 int tc = tc0[i];
3039 int i_delta;
3040
3041 if( FFABS( p2 - p0 ) < beta ) {
3042 if(tc0[i])
3043 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
3044 tc++;
3045 }
3046 if( FFABS( q2 - q0 ) < beta ) {
3047 if(tc0[i])
3048 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
3049 tc++;
3050 }
3051
3052 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3053 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
3054 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
3055 }
3056 pix += ystride;
3057 }
3058 }
3059 }
3060 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3061 {
3062 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
3063 }
3064 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3065 {
3066 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
3067 }
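/* h264_loop_filter_luma_c is written once in terms of (xstride, ystride) and
 * specialised by the two wrappers above: a vertical edge is crossed with
 * xstride == 1, a horizontal one with xstride == stride. tc0[i] < 0 skips a
 * whole group of four lines, and tc is incremented for each p2/q2 side
 * condition that holds, matching the standard's clipping-table adjustment. */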
3068
3069 static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3070 {
3071 int d;
3072 for( d = 0; d < 16; d++ ) {
3073 const int p2 = pix[-3*xstride];
3074 const int p1 = pix[-2*xstride];
3075 const int p0 = pix[-1*xstride];
3076
3077 const int q0 = pix[ 0*xstride];
3078 const int q1 = pix[ 1*xstride];
3079 const int q2 = pix[ 2*xstride];
3080
3081 if( FFABS( p0 - q0 ) < alpha &&
3082 FFABS( p1 - p0 ) < beta &&
3083 FFABS( q1 - q0 ) < beta ) {
3084
3085 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
3086 if( FFABS( p2 - p0 ) < beta)
3087 {
3088 const int p3 = pix[-4*xstride];
3089 /* p0', p1', p2' */
3090 pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
3091 pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
3092 pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
3093 } else {
3094 /* p0' */
3095 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3096 }
3097 if( FFABS( q2 - q0 ) < beta)
3098 {
3099 const int q3 = pix[3*xstride];
3100 /* q0', q1', q2' */
3101 pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;