/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/dsputil.c
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "mathops.h"
#include "snow.h"
#include "mpegvideo.h"
#include "config.h"
#include "lpc.h"
#include "ac3dec.h"
#include "vorbis.h"
#include "png.h"

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f, depending on the CPU's native word size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
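/* Illustrative note (not part of the original file): ~0UL/255 yields the
 * all-bytes-0x01 pattern for the native unsigned long, so multiplying by a
 * byte value replicates that byte across every lane:
 *   32-bit long: ~0UL/255        == 0x01010101
 *                ~0UL/255 * 0x7f == 0x7f7f7f7f
 *   64-bit long: ~0UL/255 * 0x7f == 0x7f7f7f7f7f7f7f7f
 * These repeated-byte masks drive the SWAR (SIMD-within-a-register)
 * averaging tricks used further below. */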

const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permuted inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];

const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
 * for a>16909558, it is an overestimate by less than 1 part in 1<<24 */
const uint32_t ff_inverse[257]={
         0, 4294967295U,2147483648U, 1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
  16777216
};
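/* Illustrative sketch (not from the original file): the table implements
 * division by a small constant via a 32.32 fixed-point reciprocal.  A
 * hypothetical helper using it could look like:
 *
 *   static inline uint32_t div_by_table(uint32_t a, int b)
 *   {
 *       // valid for 2 <= b <= 256 and a <= 16909558, per the comment above
 *       return (uint32_t)(((uint64_t)a * ff_inverse[b]) >> 32);
 *   }
 *
 * e.g. div_by_table(1000, 3) == 333. */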

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};

void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#if ARCH_PPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
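/* Note added for clarity (a reading of the code above, not original text):
 * permutated[] remaps a scan order through the IDCT's input permutation,
 * and raster_end[i] records the highest permuted index seen among the first
 * i+1 scan positions, so callers can tell how far a partially coded block
 * extends in raster order. */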

static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
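/* Clarifying note (not original text): ff_squareTbl is filled at init time
 * with x*x for x in [-256, 255], biased by 256, so "sq = ff_squareTbl + 256"
 * lets the SSE functions index directly with a signed pixel difference,
 * e.g. sq[-3] == sq[3] == 9, avoiding an abs() per sample. */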

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}


#if CONFIG_SNOW_ENCODER //dwt is in snow.c
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif

/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}

/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                         int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
        //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

        //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
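/* Usage sketch (illustrative, not original text): a motion-compensation
 * routine whose 17x17 read may cross the picture border can redirect the
 * read through a padded copy.  Note that buf is addressed with the same
 * linesize as src, so it must hold (block_h-1)*linesize + block_w bytes:
 *
 *   ff_emulated_edge_mc(edge_buf, src + src_y*linesize + src_x, linesize,
 *                       17, 17, src_x, src_y, width, height);
 *   src = edge_buf;  // hypothetical buffer name; then filter from the copy
 */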

static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}


static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = block[0];
        pixels[1] = block[1];
        pixels[2] = block[2];
        pixels[3] = block[3];
        pixels[4] = block[4];
        pixels[5] = block[5];
        pixels[6] = block[6];
        pixels[7] = block[7];

        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}

static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}

static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int i, j;
    uint16_t *dst1 = (uint16_t *) dst;
    uint16_t *dst2 = (uint16_t *)(dst + linesize);

    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            dst1[i] = dst2[i] = src[i] * 0x0101;
        }
        src  += 8;
        dst1 += linesize;
        dst2 += linesize;
    }
}
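/* Clarifying note (not original text): multiplying a byte by 0x0101
 * replicates it into both bytes of a uint16_t (e.g. 0xAB*0x0101 == 0xABAB),
 * so each source sample is written as a 2x2 block: two horizontal copies
 * via the 16-bit store, two vertical copies via dst1/dst2.  scale_block_c
 * thus upscales an 8x8 block to 16x16. */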

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= AV_RN64(pixels  );\
    const uint64_t b= AV_RN64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= AV_RN64(pixels  );\
        uint64_t b= AV_RN64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= AV_RN64(pixels  );\
        b= AV_RN64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= AV_RN64(pixels  );\
    const uint64_t b= AV_RN64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= AV_RN64(pixels  );\
        uint64_t b= AV_RN64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= AV_RN64(pixels  );\
        b= AV_RN64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // the 64-bit variant above is disabled; 32-bit code follows

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                 int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i, a0, b0, a1, b1;\
    a0= pixels[0];\
    b0= pixels[1] + 2;\
    a0 += b0;\
    b0 += pixels[2];\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        a1= pixels[0];\
        b1= pixels[1];\
        a1 += b1;\
        b1 += pixels[2];\
\
        block[0]= (a1+a0)>>2; /* FIXME non put */\
        block[1]= (b1+b0)>>2;\
\
        pixels+=line_size;\
        block +=line_size;\
\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        block[0]= (a1+a0)>>2;\
        block[1]= (b1+b0)>>2;\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint32_t a= AV_RN32(pixels  );\
    const uint32_t b= AV_RN32(pixels+1);\
    uint32_t l0=  (a&0x03030303UL)\
                + (b&0x03030303UL)\
                + 0x02020202UL;\
    uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
               + ((b&0xFCFCFCFCUL)>>2);\
    uint32_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint32_t a= AV_RN32(pixels  );\
        uint32_t b= AV_RN32(pixels+1);\
        l1=  (a&0x03030303UL)\
           + (b&0x03030303UL);\
        h1= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
        a= AV_RN32(pixels  );\
        b= AV_RN32(pixels+1);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c    , OPNAME ## _pixels8_c        , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
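/* Clarifying note (not original text): avg2/avg4 are the scalar rounding
 * averages; rnd_avg32()/no_rnd_avg32() used above compute the same per-byte
 * average on four packed bytes at once:
 *   (a | b) - (((a ^ b) & 0xFEFEFEFE) >> 1)   rounds up   (rnd)
 *   (a & b) + (((a ^ b) & 0xFEFEFEFE) >> 1)   rounds down (no_rnd)
 * e.g. bytes 3 and 4 average to 4 with rounding and to 3 without. */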

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
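/* Clarifying note (not original text): the bilinear weights always satisfy
 *   A+B+C+D = (16-x16)*(16-y16) + x16*(16-y16) + (16-x16)*y16 + x16*y16 = 256,
 * so the >>8 above renormalizes the interpolated sum; "rounder" is the
 * rounding bias (128 for round-to-nearest, 127 for the MPEG-4 no-rounding
 * mode, judging by the callers in mpegvideo). */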

void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= (  (  src[index  ]*(s-frac_x)
                                           + src[index+1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index       ]*(s-frac_y)
                                           + src[index+stride]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
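/* Clarifying note (not original text): the tpel (third-pel) filters
 * approximate division by 3 and by 12 with fixed-point reciprocals:
 *   683  ~= 2^11/3,  so (683*v)  >> 11 ~= v/3  (v = weighted sum, weights total 3)
 *   2731 ~= 2^15/12, so (2731*v) >> 15 ~= v/12 (v = weighted sum, weights total 12)
 * The +1 resp. +6 terms provide rounding before the division. */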

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
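/* Clarifying note (not original text): for the chroma filters above,
 * A+B+C+D = (8-x)*(8-y) + x*(8-y) + (8-x)*y + x*y = 64, so op_put's
 * (b+32)>>6 is a round-to-nearest renormalization of the bilinear sum, and
 * op_avg additionally averages the result with the existing destination
 * pixel, with its own +1 rounding. */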
1730
1731 static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1732 const int A=(8-x)*(8-y);
1733 const int B=( x)*(8-y);
1734 const int C=(8-x)*( y);
1735 const int D=( x)*( y);
1736 int i;
1737
1738 assert(x<8 && y<8 && x>=0 && y>=0);
1739
1740 for(i=0; i<h; i++)
1741 {
1742 dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
1743 dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
1744 dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
1745 dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
1746 dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
1747 dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
1748 dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
1749 dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
1750 dst+= stride;
1751 src+= stride;
1752 }
1753 }
1754
1755 static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1756 const int A=(8-x)*(8-y);
1757 const int B=( x)*(8-y);
1758 const int C=(8-x)*( y);
1759 const int D=( x)*( y);
1760 int i;
1761
1762 assert(x<8 && y<8 && x>=0 && y>=0);
1763
1764 for(i=0; i<h; i++)
1765 {
1766 dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
1767 dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
1768 dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
1769 dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
1770 dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
1771 dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
1772 dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
1773 dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
1774 dst+= stride;
1775 src+= stride;
1776 }
1777 }
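
/* The VC-1 variants above use a bias of 32-4 = 28 instead of 32: VC-1's
 * rounding-control mode biases the result slightly downward so rounding
 * error does not accumulate along prediction chains, and the avg_ variant
 * then blends (via avg2) with the prediction already in dst. */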
1778
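/* QPEL_MC below generates the MPEG-4 quarter-pel functions. The half-pel
 * lowpass is the symmetric tap set (-1, 3, -6, 20, 20, -6, 3, -1)/32.
 * The seemingly inconsistent taps near the block edges (e.g. src[8] used
 * twice in dst[5..7] of the 8-wide filter) are intentional: MPEG-4 mirrors
 * samples at the block boundary instead of reading beyond it, so the
 * outer taps fold back into the block. One interior output pixel, as an
 * illustrative sketch using the op_put rounding defined further down:
 */
static av_unused uint8_t mpeg4_qpel_h_pixel_sketch(const uint8_t *src)
{
    /* centered between src[0] and src[1]; weights sum to 32 */
    int v = (src[ 0] + src[1]) * 20
          - (src[-1] + src[2]) * 6
          + (src[-2] + src[3]) * 3
          - (src[-3] + src[4]);
    return av_clip_uint8((v + 16) >> 5);
}
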
1779 #define QPEL_MC(r, OPNAME, RND, OP) \
1780 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1781 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1782 int i;\
1783 for(i=0; i<h; i++)\
1784 {\
1785 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1786 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1787 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1788 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1789 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1790 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1791 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1792 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1793 dst+=dstStride;\
1794 src+=srcStride;\
1795 }\
1796 }\
1797 \
1798 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1799 const int w=8;\
1800 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1801 int i;\
1802 for(i=0; i<w; i++)\
1803 {\
1804 const int src0= src[0*srcStride];\
1805 const int src1= src[1*srcStride];\
1806 const int src2= src[2*srcStride];\
1807 const int src3= src[3*srcStride];\
1808 const int src4= src[4*srcStride];\
1809 const int src5= src[5*srcStride];\
1810 const int src6= src[6*srcStride];\
1811 const int src7= src[7*srcStride];\
1812 const int src8= src[8*srcStride];\
1813 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1814 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1815 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1816 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1817 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1818 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1819 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1820 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1821 dst++;\
1822 src++;\
1823 }\
1824 }\
1825 \
1826 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1827 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1828 int i;\
1829 \
1830 for(i=0; i<h; i++)\
1831 {\
1832 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1833 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1834 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1835 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1836 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1837 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1838 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1839 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1840 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1841 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1842 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1843 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1844 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1845 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1846 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1847 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1848 dst+=dstStride;\
1849 src+=srcStride;\
1850 }\
1851 }\
1852 \
1853 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1854 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1855 int i;\
1856 const int w=16;\
1857 for(i=0; i<w; i++)\
1858 {\
1859 const int src0= src[0*srcStride];\
1860 const int src1= src[1*srcStride];\
1861 const int src2= src[2*srcStride];\
1862 const int src3= src[3*srcStride];\
1863 const int src4= src[4*srcStride];\
1864 const int src5= src[5*srcStride];\
1865 const int src6= src[6*srcStride];\
1866 const int src7= src[7*srcStride];\
1867 const int src8= src[8*srcStride];\
1868 const int src9= src[9*srcStride];\
1869 const int src10= src[10*srcStride];\
1870 const int src11= src[11*srcStride];\
1871 const int src12= src[12*srcStride];\
1872 const int src13= src[13*srcStride];\
1873 const int src14= src[14*srcStride];\
1874 const int src15= src[15*srcStride];\
1875 const int src16= src[16*srcStride];\
1876 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1877 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1878 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1879 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1880 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1881 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1882 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1883 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1884 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1885 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1886 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1887 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1888 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1889 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1890 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1891 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1892 dst++;\
1893 src++;\
1894 }\
1895 }\
1896 \
1897 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1898 OPNAME ## pixels8_c(dst, src, stride, 8);\
1899 }\
1900 \
1901 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1902 uint8_t half[64];\
1903 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1904 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1905 }\
1906 \
1907 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1908 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1909 }\
1910 \
1911 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1912 uint8_t half[64];\
1913 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1914 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1915 }\
1916 \
1917 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1918 uint8_t full[16*9];\
1919 uint8_t half[64];\
1920 copy_block9(full, src, 16, stride, 9);\
1921 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1922 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1923 }\
1924 \
1925 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1926 uint8_t full[16*9];\
1927 copy_block9(full, src, 16, stride, 9);\
1928 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1929 }\
1930 \
1931 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1932 uint8_t full[16*9];\
1933 uint8_t half[64];\
1934 copy_block9(full, src, 16, stride, 9);\
1935 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1936 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1937 }\
1938 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1939 uint8_t full[16*9];\
1940 uint8_t halfH[72];\
1941 uint8_t halfV[64];\
1942 uint8_t halfHV[64];\
1943 copy_block9(full, src, 16, stride, 9);\
1944 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1945 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1946 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1947 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1948 }\
1949 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1950 uint8_t full[16*9];\
1951 uint8_t halfH[72];\
1952 uint8_t halfHV[64];\
1953 copy_block9(full, src, 16, stride, 9);\
1954 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1955 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1956 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1957 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1958 }\
1959 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1960 uint8_t full[16*9];\
1961 uint8_t halfH[72];\
1962 uint8_t halfV[64];\
1963 uint8_t halfHV[64];\
1964 copy_block9(full, src, 16, stride, 9);\
1965 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1966 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1967 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1968 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1969 }\
1970 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1971 uint8_t full[16*9];\
1972 uint8_t halfH[72];\
1973 uint8_t halfHV[64];\
1974 copy_block9(full, src, 16, stride, 9);\
1975 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1976 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1977 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1978 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1979 }\
1980 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1981 uint8_t full[16*9];\
1982 uint8_t halfH[72];\
1983 uint8_t halfV[64];\
1984 uint8_t halfHV[64];\
1985 copy_block9(full, src, 16, stride, 9);\
1986 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1987 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1988 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1989 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1990 }\
1991 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1992 uint8_t full[16*9];\
1993 uint8_t halfH[72];\
1994 uint8_t halfHV[64];\
1995 copy_block9(full, src, 16, stride, 9);\
1996 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1997 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1998 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1999 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
2000 }\
2001 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2002 uint8_t full[16*9];\
2003 uint8_t halfH[72];\
2004 uint8_t halfV[64];\
2005 uint8_t halfHV[64];\
2006 copy_block9(full, src, 16, stride, 9);\
2007 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
2008 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
2009 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2010 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
2011 }\
2012 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2013 uint8_t full[16*9];\
2014 uint8_t halfH[72];\
2015 uint8_t halfHV[64];\
2016 copy_block9(full, src, 16, stride, 9);\
2017 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2018 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
2019 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2020 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
2021 }\
2022 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2023 uint8_t halfH[72];\
2024 uint8_t halfHV[64];\
2025 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2026 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2027 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
2028 }\
2029 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2030 uint8_t halfH[72];\
2031 uint8_t halfHV[64];\
2032 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2033 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2034 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
2035 }\
2036 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2037 uint8_t full[16*9];\
2038 uint8_t halfH[72];\
2039 uint8_t halfV[64];\
2040 uint8_t halfHV[64];\
2041 copy_block9(full, src, 16, stride, 9);\
2042 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2043 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
2044 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2045 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2046 }\
2047 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2048 uint8_t full[16*9];\
2049 uint8_t halfH[72];\
2050 copy_block9(full, src, 16, stride, 9);\
2051 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2052 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
2053 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2054 }\
2055 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2056 uint8_t full[16*9];\
2057 uint8_t halfH[72];\
2058 uint8_t halfV[64];\
2059 uint8_t halfHV[64];\
2060 copy_block9(full, src, 16, stride, 9);\
2061 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2062 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
2063 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2064 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2065 }\
2066 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2067 uint8_t full[16*9];\
2068 uint8_t halfH[72];\
2069 copy_block9(full, src, 16, stride, 9);\
2070 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2071 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
2072 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2073 }\
2074 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2075 uint8_t halfH[72];\
2076 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2077 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2078 }\
2079 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2080 OPNAME ## pixels16_c(dst, src, stride, 16);\
2081 }\
2082 \
2083 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2084 uint8_t half[256];\
2085 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2086 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2087 }\
2088 \
2089 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2090 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2091 }\
2092 \
2093 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2094 uint8_t half[256];\
2095 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2096 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2097 }\
2098 \
2099 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2100 uint8_t full[24*17];\
2101 uint8_t half[256];\
2102 copy_block17(full, src, 24, stride, 17);\
2103 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2104 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2105 }\
2106 \
2107 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2108 uint8_t full[24*17];\
2109 copy_block17(full, src, 24, stride, 17);\
2110 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2111 }\
2112 \
2113 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2114 uint8_t full[24*17];\
2115 uint8_t half[256];\
2116 copy_block17(full, src, 24, stride, 17);\
2117 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2118 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2119 }\
2120 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2121 uint8_t full[24*17];\
2122 uint8_t halfH[272];\
2123 uint8_t halfV[256];\
2124 uint8_t halfHV[256];\
2125 copy_block17(full, src, 24, stride, 17);\
2126 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2127 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2128 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2129 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2130 }\
2131 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2132 uint8_t full[24*17];\
2133 uint8_t halfH[272];\
2134 uint8_t halfHV[256];\
2135 copy_block17(full, src, 24, stride, 17);\
2136 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2137 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2138 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2139 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2140 }\
2141 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2142 uint8_t full[24*17];\
2143 uint8_t halfH[272];\
2144 uint8_t halfV[256];\
2145 uint8_t halfHV[256];\
2146 copy_block17(full, src, 24, stride, 17);\
2147 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2148 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2149 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2150 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2151 }\
2152 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2153 uint8_t full[24*17];\
2154 uint8_t halfH[272];\
2155 uint8_t halfHV[256];\
2156 copy_block17(full, src, 24, stride, 17);\
2157 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2158 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2159 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2160 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2161 }\
2162 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2163 uint8_t full[24*17];\
2164 uint8_t halfH[272];\
2165 uint8_t halfV[256];\
2166 uint8_t halfHV[256];\
2167 copy_block17(full, src, 24, stride, 17);\
2168 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2169 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2170 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2171 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2172 }\
2173 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2174 uint8_t full[24*17];\
2175 uint8_t halfH[272];\
2176 uint8_t halfHV[256];\
2177 copy_block17(full, src, 24, stride, 17);\
2178 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2179 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2180 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2181 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2182 }\
2183 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2184 uint8_t full[24*17];\
2185 uint8_t halfH[272];\
2186 uint8_t halfV[256];\
2187 uint8_t halfHV[256];\
2188 copy_block17(full, src, 24, stride, 17);\
2189 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2190 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2191 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2192 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2193 }\
2194 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2195 uint8_t full[24*17];\
2196 uint8_t halfH[272];\
2197 uint8_t halfHV[256];\
2198 copy_block17(full, src, 24, stride, 17);\
2199 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2200 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2201 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2202 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2203 }\
2204 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2205 uint8_t halfH[272];\
2206 uint8_t halfHV[256];\
2207 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2208 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2209 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2210 }\
2211 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2212 uint8_t halfH[272];\
2213 uint8_t halfHV[256];\
2214 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2215 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2216 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2217 }\
2218 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2219 uint8_t full[24*17];\
2220 uint8_t halfH[272];\
2221 uint8_t halfV[256];\
2222 uint8_t halfHV[256];\
2223 copy_block17(full, src, 24, stride, 17);\
2224 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2225 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2226 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2227 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2228 }\
2229 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2230 uint8_t full[24*17];\
2231 uint8_t halfH[272];\
2232 copy_block17(full, src, 24, stride, 17);\
2233 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2234 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2235 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2236 }\
2237 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2238 uint8_t full[24*17];\
2239 uint8_t halfH[272];\
2240 uint8_t halfV[256];\
2241 uint8_t halfHV[256];\
2242 copy_block17(full, src, 24, stride, 17);\
2243 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2244 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2245 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2246 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2247 }\
2248 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2249 uint8_t full[24*17];\
2250 uint8_t halfH[272];\
2251 copy_block17(full, src, 24, stride, 17);\
2252 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2253 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2254 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2255 }\
2256 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2257 uint8_t halfH[272];\
2258 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2259 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2260 }
2261
2262 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2263 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2264 #define op_put(a, b) a = cm[((b) + 16)>>5]
2265 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2266
2267 QPEL_MC(0, put_ , _ , op_put)
2268 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2269 QPEL_MC(0, avg_ , _ , op_avg)
2270 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2271 #undef op_avg
2272 #undef op_avg_no_rnd
2273 #undef op_put
2274 #undef op_put_no_rnd
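
/* The _no_rnd_ ops bias with +15 instead of +16, i.e. ties round down.
 * MPEG-4 selects these when the rounding-control bit is set, alternating
 * the rounding direction between frames so bias does not build up. */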
2275
2276 #if 1
2277 #define H264_LOWPASS(OPNAME, OP, OP2) \
2278 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2279 const int h=2;\
2280 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2281 int i;\
2282 for(i=0; i<h; i++)\
2283 {\
2284 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2285 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2286 dst+=dstStride;\
2287 src+=srcStride;\
2288 }\
2289 }\
2290 \
2291 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2292 const int w=2;\
2293 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2294 int i;\
2295 for(i=0; i<w; i++)\
2296 {\
2297 const int srcB= src[-2*srcStride];\
2298 const int srcA= src[-1*srcStride];\
2299 const int src0= src[0 *srcStride];\
2300 const int src1= src[1 *srcStride];\
2301 const int src2= src[2 *srcStride];\
2302 const int src3= src[3 *srcStride];\
2303 const int src4= src[4 *srcStride];\
2304 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2305 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2306 dst++;\
2307 src++;\
2308 }\
2309 }\
2310 \
2311 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2312 const int h=2;\
2313 const int w=2;\
2314 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2315 int i;\
2316 src -= 2*srcStride;\
2317 for(i=0; i<h+5; i++)\
2318 {\
2319 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2320 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2321 tmp+=tmpStride;\
2322 src+=srcStride;\
2323 }\
2324 tmp -= tmpStride*(h+5-2);\
2325 for(i=0; i<w; i++)\
2326 {\
2327 const int tmpB= tmp[-2*tmpStride];\
2328 const int tmpA= tmp[-1*tmpStride];\
2329 const int tmp0= tmp[0 *tmpStride];\
2330 const int tmp1= tmp[1 *tmpStride];\
2331 const int tmp2= tmp[2 *tmpStride];\
2332 const int tmp3= tmp[3 *tmpStride];\
2333 const int tmp4= tmp[4 *tmpStride];\
2334 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2335 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2336 dst++;\
2337 tmp++;\
2338 }\
2339 }\
2340 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2341 const int h=4;\
2342 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2343 int i;\
2344 for(i=0; i<h; i++)\
2345 {\
2346 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2347 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2348 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2349 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2350 dst+=dstStride;\
2351 src+=srcStride;\
2352 }\
2353 }\
2354 \
2355 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2356 const int w=4;\
2357 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2358 int i;\
2359 for(i=0; i<w; i++)\
2360 {\
2361 const int srcB= src[-2*srcStride];\
2362 const int srcA= src[-1*srcStride];\
2363 const int src0= src[0 *srcStride];\
2364 const int src1= src[1 *srcStride];\
2365 const int src2= src[2 *srcStride];\
2366 const int src3= src[3 *srcStride];\
2367 const int src4= src[4 *srcStride];\
2368 const int src5= src[5 *srcStride];\
2369 const int src6= src[6 *srcStride];\
2370 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2371 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2372 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2373 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2374 dst++;\
2375 src++;\
2376 }\
2377 }\
2378 \
2379 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2380 const int h=4;\
2381 const int w=4;\
2382 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2383 int i;\
2384 src -= 2*srcStride;\
2385 for(i=0; i<h+5; i++)\
2386 {\
2387 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2388 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2389 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2390 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2391 tmp+=tmpStride;\
2392 src+=srcStride;\
2393 }\
2394 tmp -= tmpStride*(h+5-2);\
2395 for(i=0; i<w; i++)\
2396 {\
2397 const int tmpB= tmp[-2*tmpStride];\
2398 const int tmpA= tmp[-1*tmpStride];\
2399 const int tmp0= tmp[0 *tmpStride];\
2400 const int tmp1= tmp[1 *tmpStride];\
2401 const int tmp2= tmp[2 *tmpStride];\
2402 const int tmp3= tmp[3 *tmpStride];\
2403 const int tmp4= tmp[4 *tmpStride];\
2404 const int tmp5= tmp[5 *tmpStride];\
2405 const int tmp6= tmp[6 *tmpStride];\
2406 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2407 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2408 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2409 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2410 dst++;\
2411 tmp++;\
2412 }\
2413 }\
2414 \
2415 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2416 const int h=8;\
2417 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2418 int i;\
2419 for(i=0; i<h; i++)\
2420 {\
2421 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2422 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2423 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2424 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2425 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2426 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2427 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2428 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2429 dst+=dstStride;\
2430 src+=srcStride;\
2431 }\
2432 }\
2433 \
2434 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2435 const int w=8;\
2436 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2437 int i;\
2438 for(i=0; i<w; i++)\
2439 {\
2440 const int srcB= src[-2*srcStride];\
2441 const int srcA= src[-1*srcStride];\
2442 const int src0= src[0 *srcStride];\
2443 const int src1= src[1 *srcStride];\
2444 const int src2= src[2 *srcStride];\
2445 const int src3= src[3 *srcStride];\
2446 const int src4= src[4 *srcStride];\
2447 const int src5= src[5 *srcStride];\
2448 const int src6= src[6 *srcStride];\
2449 const int src7= src[7 *srcStride];\
2450 const int src8= src[8 *srcStride];\
2451 const int src9= src[9 *srcStride];\
2452 const int src10=src[10*srcStride];\
2453 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2454 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2455 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2456 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2457 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2458 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2459 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2460 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2461 dst++;\
2462 src++;\
2463 }\
2464 }\
2465 \
2466 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2467 const int h=8;\
2468 const int w=8;\
2469 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2470 int i;\
2471 src -= 2*srcStride;\
2472 for(i=0; i<h+5; i++)\
2473 {\
2474 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2475 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2476 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2477 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2478 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2479 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2480 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2481 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2482 tmp+=tmpStride;\
2483 src+=srcStride;\
2484 }\
2485 tmp -= tmpStride*(h+5-2);\
2486 for(i=0; i<w; i++)\
2487 {\
2488 const int tmpB= tmp[-2*tmpStride];\
2489 const int tmpA= tmp[-1*tmpStride];\
2490 const int tmp0= tmp[0 *tmpStride];\
2491 const int tmp1= tmp[1 *tmpStride];\
2492 const int tmp2= tmp[2 *tmpStride];\
2493 const int tmp3= tmp[3 *tmpStride];\
2494 const int tmp4= tmp[4 *tmpStride];\
2495 const int tmp5= tmp[5 *tmpStride];\
2496 const int tmp6= tmp[6 *tmpStride];\
2497 const int tmp7= tmp[7 *tmpStride];\
2498 const int tmp8= tmp[8 *tmpStride];\
2499 const int tmp9= tmp[9 *tmpStride];\
2500 const int tmp10=tmp[10*tmpStride];\
2501 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2502 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2503 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2504 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2505 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2506 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2507 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2508 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2509 dst++;\
2510 tmp++;\
2511 }\
2512 }\
2513 \
2514 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2515 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2516 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2517 src += 8*srcStride;\
2518 dst += 8*dstStride;\
2519 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2520 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2521 }\
2522 \
2523 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2524 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2525 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2526 src += 8*srcStride;\
2527 dst += 8*dstStride;\
2528 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2529 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2530 }\
2531 \
2532 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2533 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2534 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2535 src += 8*srcStride;\
2536 dst += 8*dstStride;\
2537 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2538 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2539 }\
2540
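/* H264_LOWPASS above is the H.264 6-tap half-pel filter (1, -5, 20, 20,
 * -5, 1) with (+16)>>5 rounding. For the diagonal (hv) case the first
 * pass is kept unrounded at 16-bit precision in tmp[], and only the
 * second pass renormalizes with (+512)>>10, since the two /32 stages
 * combine into /1024. A single-pixel sketch of that two-stage arithmetic
 * (illustrative only):
 */
static av_unused uint8_t h264_qpel_hv_pixel_sketch(const uint8_t *src, int stride)
{
    int16_t tmp[6];
    int i, v;
    for (i = 0; i < 6; i++) {                       /* horizontal pass, rows -2..+3 */
        const uint8_t *s = src + (i - 2) * stride;
        tmp[i] = (s[0] + s[1]) * 20 - (s[-1] + s[2]) * 5 + (s[-2] + s[3]);
    }
    /* vertical pass over the intermediates, single final rounding */
    v = (tmp[2] + tmp[3]) * 20 - (tmp[1] + tmp[4]) * 5 + (tmp[0] + tmp[5]);
    return av_clip_uint8((v + 512) >> 10);
}
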
2541 #define H264_MC(OPNAME, SIZE) \
2542 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2543 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2544 }\
2545 \
2546 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2547 uint8_t half[SIZE*SIZE];\
2548 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2549 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2550 }\
2551 \
2552 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2553 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2554 }\
2555 \
2556 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2557 uint8_t half[SIZE*SIZE];\
2558 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2559 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2560 }\
2561 \
2562 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2563 uint8_t full[SIZE*(SIZE+5)];\
2564 uint8_t * const full_mid= full + SIZE*2;\
2565 uint8_t half[SIZE*SIZE];\
2566 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2567 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2568 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2569 }\
2570 \
2571 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2572 uint8_t full[SIZE*(SIZE+5)];\
2573 uint8_t * const full_mid= full + SIZE*2;\
2574 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2575 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2576 }\
2577 \
2578 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2579 uint8_t full[SIZE*(SIZE+5)];\
2580 uint8_t * const full_mid= full + SIZE*2;\
2581 uint8_t half[SIZE*SIZE];\
2582 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2583 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2584 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2585 }\
2586 \
2587 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2588 uint8_t full[SIZE*(SIZE+5)];\
2589 uint8_t * const full_mid= full + SIZE*2;\
2590 uint8_t halfH[SIZE*SIZE];\
2591 uint8_t halfV[SIZE*SIZE];\
2592 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2593 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2594 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2595 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2596 }\
2597 \
2598 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2599 uint8_t full[SIZE*(SIZE+5)];\
2600 uint8_t * const full_mid= full + SIZE*2;\
2601 uint8_t halfH[SIZE*SIZE];\
2602 uint8_t halfV[SIZE*SIZE];\
2603 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2604 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2605 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2606 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2607 }\
2608 \
2609 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2610 uint8_t full[SIZE*(SIZE+5)];\
2611 uint8_t * const full_mid= full + SIZE*2;\
2612 uint8_t halfH[SIZE*SIZE];\
2613 uint8_t halfV[SIZE*SIZE];\
2614 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2615 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2616 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2617 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2618 }\
2619 \
2620 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2621 uint8_t full[SIZE*(SIZE+5)];\
2622 uint8_t * const full_mid= full + SIZE*2;\
2623 uint8_t halfH[SIZE*SIZE];\
2624 uint8_t halfV[SIZE*SIZE];\
2625 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2626 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2627 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2628 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2629 }\
2630 \
2631 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2632 int16_t tmp[SIZE*(SIZE+5)];\
2633 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2634 }\
2635 \
2636 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2637 int16_t tmp[SIZE*(SIZE+5)];\
2638 uint8_t halfH[SIZE*SIZE];\
2639 uint8_t halfHV[SIZE*SIZE];\
2640 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2641 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2642 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2643 }\
2644 \
2645 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2646 int16_t tmp[SIZE*(SIZE+5)];\
2647 uint8_t halfH[SIZE*SIZE];\
2648 uint8_t halfHV[SIZE*SIZE];\
2649 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2650 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2651 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2652 }\
2653 \
2654 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2655 uint8_t full[SIZE*(SIZE+5)];\
2656 uint8_t * const full_mid= full + SIZE*2;\
2657 int16_t tmp[SIZE*(SIZE+5)];\
2658 uint8_t halfV[SIZE*SIZE];\
2659 uint8_t halfHV[SIZE*SIZE];\
2660 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2661 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2662 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2663 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2664 }\
2665 \
2666 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2667 uint8_t full[SIZE*(SIZE+5)];\
2668 uint8_t * const full_mid= full + SIZE*2;\
2669 int16_t tmp[SIZE*(SIZE+5)];\
2670 uint8_t halfV[SIZE*SIZE];\
2671 uint8_t halfHV[SIZE*SIZE];\
2672 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2673 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2674 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2675 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2676 }\
2677
2678 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2679 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2680 #define op_put(a, b) a = cm[((b) + 16)>>5]
2681 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2682 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2683
2684 H264_LOWPASS(put_ , op_put, op2_put)
2685 H264_LOWPASS(avg_ , op_avg, op2_avg)
2686 H264_MC(put_, 2)
2687 H264_MC(put_, 4)
2688 H264_MC(put_, 8)
2689 H264_MC(put_, 16)
2690 H264_MC(avg_, 4)
2691 H264_MC(avg_, 8)
2692 H264_MC(avg_, 16)
2693
2694 #undef op_avg
2695 #undef op_put
2696 #undef op2_avg
2697 #undef op2_put
2698 #endif
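
/* In the mcXY names generated by H264_MC, X is the horizontal and Y the
 * vertical quarter-pel phase (0..3). Even phases come straight from the
 * 6-tap filters (or a plain copy for mc00); odd phases average two
 * neighbouring integer/half-pel planes with pixels*_l2(). Note only put_
 * instantiates the 2x2 size above; avg_ starts at 4x4. */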
2699
2700 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2701 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2702 #define H264_WEIGHT(W,H) \
2703 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2704 int y; \
2705 offset <<= log2_denom; \
2706 if(log2_denom) offset += 1<<(log2_denom-1); \
2707 for(y=0; y<H; y++, block += stride){ \
2708 op_scale1(0); \
2709 op_scale1(1); \
2710 if(W==2) continue; \
2711 op_scale1(2); \
2712 op_scale1(3); \
2713 if(W==4) continue; \
2714 op_scale1(4); \
2715 op_scale1(5); \
2716 op_scale1(6); \
2717 op_scale1(7); \
2718 if(W==8) continue; \
2719 op_scale1(8); \
2720 op_scale1(9); \
2721 op_scale1(10); \
2722 op_scale1(11); \
2723 op_scale1(12); \
2724 op_scale1(13); \
2725 op_scale1(14); \
2726 op_scale1(15); \
2727 } \
2728 } \
2729 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2730 int y; \
2731 offset = ((offset + 1) | 1) << log2_denom; /* fold the bidirectional rounding term into one odd bias for the final >>(log2_denom+1) */ \
2732 for(y=0; y<H; y++, dst += stride, src += stride){ \
2733 op_scale2(0); \
2734 op_scale2(1); \
2735 if(W==2) continue; \
2736 op_scale2(2); \
2737 op_scale2(3); \
2738 if(W==4) continue; \
2739 op_scale2(4); \
2740 op_scale2(5); \
2741 op_scale2(6); \
2742 op_scale2(7); \
2743 if(W==8) continue; \
2744 op_scale2(8); \
2745 op_scale2(9); \
2746 op_scale2(10); \
2747 op_scale2(11); \
2748 op_scale2(12); \
2749 op_scale2(13); \
2750 op_scale2(14); \
2751 op_scale2(15); \
2752 } \
2753 }
2754
2755 H264_WEIGHT(16,16)
2756 H264_WEIGHT(16,8)
2757 H264_WEIGHT(8,16)
2758 H264_WEIGHT(8,8)
2759 H264_WEIGHT(8,4)
2760 H264_WEIGHT(4,8)
2761 H264_WEIGHT(4,4)
2762 H264_WEIGHT(4,2)
2763 H264_WEIGHT(2,4)
2764 H264_WEIGHT(2,2)
2765
2766 #undef op_scale1
2767 #undef op_scale2
2768 #undef H264_WEIGHT
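
/* H.264 weighted prediction: op_scale1 is the unidirectional case
 *     clip(((p*w) >> d) + o)
 * with the 2^(d-1) rounding term and the offset pre-folded into a single
 * bias, and op_scale2 is the bidirectional case with two weights and one
 * extra >>1. A sketch of the unidirectional pixel (illustrative only):
 */
static av_unused uint8_t h264_weight_pixel_sketch(uint8_t p, int log2_denom, int weight, int offset)
{
    int bias = offset << log2_denom;       /* offset applied before the shift */
    if (log2_denom)
        bias += 1 << (log2_denom - 1);     /* round to nearest */
    return av_clip_uint8((p * weight + bias) >> log2_denom);
}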
2769
2770 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2771 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2772 int i;
2773
2774 for(i=0; i<h; i++){
2775 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2776 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2777 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2778 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2779 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2780 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2781 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2782 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2783 dst+=dstStride;
2784 src+=srcStride;
2785 }
2786 }
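
/* The WMV2 mspel half-pel filter is the 4-tap (-1, 9, 9, -1)/16 kernel
 * with round-to-nearest, e.g. for one pixel (illustrative sketch):
 */
static av_unused uint8_t wmv2_halfpel_pixel_sketch(const uint8_t *src)
{
    return av_clip_uint8((9 * (src[0] + src[1]) - (src[-1] + src[2]) + 8) >> 4);
}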
2787
2788 #if CONFIG_CAVS_DECODER
2789 /* AVS specific */
2790 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2791 put_pixels8_c(dst, src, stride, 8);
2792 }
2793 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2794 avg_pixels8_c(dst, src, stride, 8);
2795 }
2796 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2797 put_pixels16_c(dst, src, stride, 16);
2798 }
2799 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2800 avg_pixels16_c(dst, src, stride, 16);
2801 }
2802 #endif /* CONFIG_CAVS_DECODER */
2803
2804 #if CONFIG_VC1_DECODER
2805 /* VC-1 specific */
2806 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2807 put_pixels8_c(dst, src, stride, 8);
2808 }
2809 void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2810 avg_pixels8_c(dst, src, stride, 8);
2811 }
2812 #endif /* CONFIG_VC1_DECODER */
2813
2814 /* H264 specific */
2815 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2816
2817 #if CONFIG_RV40_DECODER
2818 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2819 put_pixels16_xy2_c(dst, src, stride, 16);
2820 }
2821 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2822 avg_pixels16_xy2_c(dst, src, stride, 16);
2823 }
2824 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2825 put_pixels8_xy2_c(dst, src, stride, 8);
2826 }
2827 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2828 avg_pixels8_xy2_c(dst, src, stride, 8);
2829 }
2830 #endif /* CONFIG_RV40_DECODER */
2831
2832 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2833 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2834 int i;
2835
2836 for(i=0; i<w; i++){
2837 const int src_1= src[ -srcStride];
2838 const int src0 = src[0 ];
2839 const int src1 = src[ srcStride];
2840 const int src2 = src[2*srcStride];
2841 const int src3 = src[3*srcStride];
2842 const int src4 = src[4*srcStride];
2843 const int src5 = src[5*srcStride];
2844 const int src6 = src[6*srcStride];
2845 const int src7 = src[7*srcStride];
2846 const int src8 = src[8*srcStride];
2847 const int src9 = src[9*srcStride];
2848 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2849 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2850 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2851 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2852 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2853 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2854 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2855 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2856 src++;
2857 dst++;
2858 }
2859 }
2860
2861 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2862 put_pixels8_c(dst, src, stride, 8);
2863 }
2864
2865 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2866 uint8_t half[64];
2867 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2868 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2869 }
2870
2871 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2872 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2873 }
2874
2875 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2876 uint8_t half[64];
2877 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2878 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2879 }
2880
2881 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2882 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2883 }
2884
2885 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2886 uint8_t halfH[88];
2887 uint8_t halfV[64];
2888 uint8_t halfHV[64];
2889 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2890 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2891 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2892 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2893 }
2894 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2895 uint8_t halfH[88];
2896 uint8_t halfV[64];
2897 uint8_t halfHV[64];
2898 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2899 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2900 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2901 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2902 }
2903 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2904 uint8_t halfH[88];
2905 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2906 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2907 }
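
/* For the diagonal mspel positions above, halfH is computed over 11 rows
 * starting one row above the block, so the following vertical pass (which
 * needs a row of context above and two below its 8 outputs) can run
 * entirely on the buffered data; the final pixel is then the average of
 * the V and HV planes. */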
2908
2909 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2910 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { /* compile-time constant: body is dead-code-eliminated when H.263 support is disabled */
2911 int x;
2912 const int strength= ff_h263_loop_filter_strength[qscale];
2913
2914 for(x=0; x<8; x++){
2915 int d1, d2, ad1;
2916 int p0= src[x-2*stride];
2917 int p1= src[x-1*stride];
2918 int p2= src[x+0*stride];
2919 int p3= src[x+1*stride];
2920 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2921
2922 if (d<-2*strength) d1= 0;
2923 else if(d<- strength) d1=-2*strength - d;
2924 else if(d< strength) d1= d;
2925 else if(d< 2*strength) d1= 2*strength - d;
2926 else d1= 0;
2927
2928 p1 += d1;
2929 p2 -= d1;
2930 if(p1&256) p1= ~(p1>>31); /* branchless clamp: negatives -> 0, 256..511 -> -1, which the uint8_t store truncates to 255 */
2931 if(p2&256) p2= ~(p2>>31);
2932
2933 src[x-1*stride] = p1;
2934 src[x+0*stride] = p2;
2935
2936 ad1= FFABS(d1)>>1;
2937
2938 d2= av_clip((p0-p3)/4, -ad1, ad1);
2939
2940 src[x-2*stride] = p0 - d2;
2941 src[x+1*stride] = p3 + d2;
2942 }
2943 }
2944 }
2945
2946 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2947 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2948 int y;
2949 const int strength= ff_h263_loop_filter_strength[qscale];
2950
2951 for(y=0; y<8; y++){
2952 int d1, d2, ad1;
2953 int p0= src[y*stride-2];
2954 int p1= src[y*stride-1];
2955 int p2= src[y*stride+0];
2956 int p3= src[y*stride+1];
2957 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2958
2959 if (d<-2*strength) d1= 0;
2960 else if(d<- strength) d1=-2*strength - d;
2961 else if(d< strength) d1= d;
2962 else if(d< 2*strength) d1= 2*strength - d;
2963 else d1= 0;
2964
2965 p1 += d1;
2966 p2 -= d1;
2967 if(p1&256) p1= ~(p1>>31);
2968 if(p2&256) p2= ~(p2>>31);
2969
2970 src[y*stride-1] = p1;
2971 src[y*stride+0] = p2;
2972
2973 ad1= FFABS(d1)>>1;
2974
2975 d2= av_clip((p0-p3)/4, -ad1, ad1);
2976
2977 src[y*stride-2] = p0 - d2;
2978 src[y*stride+1] = p3 + d2;
2979 }
2980 }
2981 }
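
/* The H.263 loop filter measures the edge step d across the block
 * boundary and applies a piecewise-linear correction d1: small steps are
 * filtered at full strength, larger ones taper off, and genuine edges
 * (|d| >= 2*strength) are left untouched. The ramp, as a standalone
 * sketch (illustrative only):
 */
static av_unused int h263_d1_ramp_sketch(int d, int strength)
{
    if (d < -2 * strength) return 0;
    if (d <    -strength)  return -2 * strength - d;
    if (d <     strength)  return d;
    if (d <  2 * strength) return 2 * strength - d;
    return 0;
}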
2982
2983 static void h261_loop_filter_c(uint8_t *src, int stride){
2984 int x,y,xy,yz;
2985 int temp[64];
2986
2987 for(x=0; x<8; x++){
2988 temp[x ] = 4*src[x ];
2989 temp[x + 7*8] = 4*src[x + 7*stride];
2990 }
2991 for(y=1; y<7; y++){
2992 for(x=0; x<8; x++){
2993 xy = y * stride + x;
2994 yz = y * 8 + x;
2995 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2996 }
2997 }
2998
2999 for(y=0; y<8; y++){
3000 src[ y*stride] = (temp[ y*8] + 2)>>2;
3001 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
3002 for(x=1; x<7; x++){
3003 xy = y * stride + x;
3004 yz = y * 8 + x;
3005 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
3006 }
3007 }
3008 }
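
/* The H.261 in-loop filter is a separable [1 2 1]/4 smoothing kernel over
 * the 8x8 block: the vertical pass above stores 4x-scaled sums in temp[],
 * the horizontal pass renormalizes with (+8)>>4. Border rows and columns
 * are filtered only along the edge, and corner pixels pass through
 * unchanged. */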
3009
3010 static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
3011 {
3012 int i, d;
3013 for( i = 0; i < 4; i++ ) {
3014 if( tc0[i] < 0 ) {
3015 pix += 4*ystride;
3016 continue;
3017 }
3018 for( d = 0; d < 4; d++ ) {
3019 const int p0 = pix[-1*xstride];
3020 const int p1 = pix[-2*xstride];
3021 const int p2 = pix[-3*xstride];
3022 const int q0 = pix[0];
3023 const int q1 = pix[1*xstride];
3024 const int q2 = pix[2*xstride];
3025
3026 if( FFABS( p0 - q0 ) < alpha &&
3027 FFABS( p1 - p0 ) < beta &&
3028 FFABS( q1 - q0 ) < beta ) {
3029
3030 int tc = tc0[i];
3031 int i_delta;
3032
3033 if( FFABS( p2 - p0 ) < beta ) {
3034 if(tc0[i])
3035 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
3036 tc++;
3037 }
3038 if( FFABS( q2 - q0 ) < beta ) {
3039 if(tc0[i])
3040 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
3041 tc++;
3042 }
3043
3044 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3045 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
3046 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
3047 }
3048 pix += ystride;
3049 }
3050 }
3051 }
3052 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3053 {
3054 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
3055 }
3056 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3057 {
3058 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
3059 }
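
/* Normal (inter) H.264 luma deblocking: alpha bounds the step across the
 * edge, beta bounds the flatness on each side, and tc0 limits how far
 * p0/q0 may move; tc grows by one for each side flat enough (|p2-p0| or
 * |q2-q0| < beta) to also receive a clipped p1/q1 update. The shared
 * p0/q0 core, as an illustrative standalone sketch:
 */
static av_unused void h264_deblock_core_sketch(uint8_t *p0, uint8_t *q0, int p1, int q1, int tc)
{
    const int delta = av_clip((((*q0 - *p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);
    *p0 = av_clip_uint8(*p0 + delta);
    *q0 = av_clip_uint8(*q0 - delta);
}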
3060
static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 16; d++ ) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];

        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
                if( FFABS( p2 - p0 ) < beta)
                {
                    const int p3 = pix[-4*xstride];
                    /* p0', p1', p2' */
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                } else {
                    /* p0' */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                }
                if( FFABS( q2 - q0 ) < beta)
                {
                    const int q3 = pix[3*xstride];
                    /* q0', q1', q2' */
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                } else {
                    /* q0' */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                }
            }else{
                /* p0', q0' */
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
}

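/**
 * H.264 normal chroma filtering: same alpha/beta thresholds as the
 * luma case, but only p0/q0 are modified and each tc0 entry covers a
 * 2-pixel segment of the 8-sample edge.
 */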
static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}

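/**
 * H.264 strong (intra) chroma filtering: a fixed 3-tap smoothing of
 * p0/q0 with no tc clipping.
 */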
static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}

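/**
 * Sum of absolute differences of a 16-pixel-wide block over h rows;
 * the basic full-pel motion-estimation comparison function.
 */
/* Illustrative sketch (not built): how such a comparison function is
 * typically reached through the DSPContext function-pointer table,
 * assuming the usual dsputil_init() wiring where pix_abs[0][0..3]
 * hold the 16-wide full/x/y/xy half-pel variants ([1][*] = 8-wide).
 * The context argument is unused by these C versions, so NULL is
 * passed here. */
#if 0
static int sad16_example(DSPContext *dsp, uint8_t *cur, uint8_t *ref,
                         int stride)
{
    /* [0][0]=full pel, [0][1]=x half, [0][2]=y half, [0][3]=xy half */
    return dsp->pix_abs[0][0](NULL, cur, ref, stride, 16);
}
#endif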
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

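/**
 * SAD against a horizontal half-pel reference: each reference sample
 * is the rounding average avg2(a,b) = (a+b+1)>>1 of two horizontal
 * neighbours (avg2()/avg4() are defined earlier in this file).
 */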
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

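/**
 * The _y2 and _xy2 variants below do the same for the vertical and
 * diagonal half-pel positions: avg2() with the next row, and avg4()
 * over the 2x2 neighbourhood, respectively.
 */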
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

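/* 8-pixel-wide counterparts of the four SAD functions above, used for
 * 8x8 comparisons. */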
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

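/**
 * Noise-preserving SSE comparison ("NSSE"): plain SSE plus a penalty
 * proportional to the absolute difference between the total 2x2
 * second-difference (texture) energy of the two blocks, so an encoder
 * can prefer errors that keep the noise structure of the source. The
 * weight comes from avctx->nsse_weight; 8 is used when no context is
 * available.
 */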
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

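/**
 * Scores adding 'scale' times a DCT basis function to the residual
 * rem[] (RECON_SHIFT fixed point): returns the sum of squared,
 * per-coefficient weighted reconstruction errors. Together with
 * add_8x8basis_c() below it supports the encoder's rate-distortion
 * quantizer refinement.
 */
/* Hypothetical sketch (not built): the score-then-commit pattern these
 * helpers are designed for. 'change' and 'best_score' are illustrative
 * placeholders, not the actual encoder logic. */
#if 0
if (try_8x8basis_c(rem, weight, basis, change) < best_score) {
    /* commit: apply the same change to rem[] via add_8x8basis_c() */
}
#endif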
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}

static void add_8x8basis_c(int16_t rem[