4afe743b7197cb6453833769f80160ebcfbd118e
[libav.git] / libavcodec / dsputil.c
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21 */
22
23 /**
24 * @file dsputil.c
25 * DSP utils
26 */
27
28 #include "avcodec.h"
29 #include "dsputil.h"
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
32 #include "faandct.h"
33
/* Clipping lookup table with MAX_NEG_CROP guard entries on each side;
   presumably filled at init time so the middle 256 entries are the
   identity and the guards saturate to 0/255 — confirm in the init code. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Square lookup table; usage below (sq = squareTbl + 256; sq[diff]) implies
   squareTbl[256+i] == i*i for -256 <= i < 256 — filled elsewhere, confirm. */
uint32_t squareTbl[512];
36
/* Classic 8x8 zigzag scan: entry k is the raster-order index of the
   coefficient visited at scan position k. */
const uint8_t ff_zigzag_direct[64] = {
    0, 1, 8, 16, 9, 2, 3, 10,
    17, 24, 32, 25, 18, 11, 4, 5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13, 6, 7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
47
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
    0, 8, 1, 9, 16, 24, 2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26, 3, 11, 4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28, 5, 13, 6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30, 7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
60
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): zero-initialized here; presumably filled at init time —
   confirm against the dsputil init code. */
uint16_t __align8 inv_zigzag_direct16[64];
63
/* Alternate horizontal scan order (used for interlaced material). */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0, 1, 2, 3, 8, 9, 16, 17,
    10, 11, 4, 5, 6, 7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
74
/* Alternate vertical scan order (used for interlaced material). */
const uint8_t ff_alternate_vertical_scan[64] = {
    0, 8, 16, 24, 1, 9, 2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18, 3, 11, 4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28, 5, 13, 6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30, 7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
85
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Reciprocal table enabling division-free integer divide: entry b is
   ceil(2^32 / b) (entries 0 and 1 are placeholders/saturated). */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
121
/* Input permutation for the simple_idct_mmx */
/* Each entry maps a natural-order coefficient index to the position the
   MMX IDCT expects it at. */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
133
static int pix_sum_c(uint8_t * pix, int line_size)
{
    /* Sum of all 256 pixels of a 16x16 block; line_size is the row stride. */
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
155
/**
 * Sum of squares of all pixels of a 16x16 block; line_size is the row
 * stride.  Uses the centred square lookup table (squareTbl) so each byte
 * only costs one load.
 */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    /* presumably sq[v] == v*v for -256 <= v < 256 (table filled at init) */
    uint32_t *sq = squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            /* straightforward byte-wise reference version */
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            /* 64-bit hosts: one 8-byte load, then square each lane.
               NOTE(review): the cast assumes pix is 8-byte aligned and
               bypasses strict aliasing — long-standing dsputil idiom.
               Byte order does not affect the result since every lane is
               squared and summed independently. */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* 32-bit hosts: two 4-byte loads per 8 pixels */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        /* advance to the next row (16 bytes already consumed) */
        pix += line_size - 16;
    }
    return s;
}
203
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    /* Byte-swap w 32-bit words from src into dst.  The main loop is
       unrolled eight-wide; a scalar loop handles the 0..7 word tail. */
    int n = 0;

    while (n + 8 <= w) {
        dst[n + 0] = bswap_32(src[n + 0]);
        dst[n + 1] = bswap_32(src[n + 1]);
        dst[n + 2] = bswap_32(src[n + 2]);
        dst[n + 3] = bswap_32(src[n + 3]);
        dst[n + 4] = bswap_32(src[n + 4]);
        dst[n + 5] = bswap_32(src[n + 5]);
        dst[n + 6] = bswap_32(src[n + 6]);
        dst[n + 7] = bswap_32(src[n + 7]);
        n += 8;
    }
    while (n < w) {
        dst[n] = bswap_32(src[n]);
        n++;
    }
}
221
222 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
223 {
224 int s, i;
225 uint32_t *sq = squareTbl + 256;
226
227 s = 0;
228 for (i = 0; i < h; i++) {
229 s += sq[pix1[0] - pix2[0]];
230 s += sq[pix1[1] - pix2[1]];
231 s += sq[pix1[2] - pix2[2]];
232 s += sq[pix1[3] - pix2[3]];
233 s += sq[pix1[4] - pix2[4]];
234 s += sq[pix1[5] - pix2[5]];
235 s += sq[pix1[6] - pix2[6]];
236 s += sq[pix1[7] - pix2[7]];
237 pix1 += line_size;
238 pix2 += line_size;
239 }
240 return s;
241 }
242
243 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
244 {
245 int s, i;
246 uint32_t *sq = squareTbl + 256;
247
248 s = 0;
249 for (i = 0; i < h; i++) {
250 s += sq[pix1[ 0] - pix2[ 0]];
251 s += sq[pix1[ 1] - pix2[ 1]];
252 s += sq[pix1[ 2] - pix2[ 2]];
253 s += sq[pix1[ 3] - pix2[ 3]];
254 s += sq[pix1[ 4] - pix2[ 4]];
255 s += sq[pix1[ 5] - pix2[ 5]];
256 s += sq[pix1[ 6] - pix2[ 6]];
257 s += sq[pix1[ 7] - pix2[ 7]];
258 s += sq[pix1[ 8] - pix2[ 8]];
259 s += sq[pix1[ 9] - pix2[ 9]];
260 s += sq[pix1[10] - pix2[10]];
261 s += sq[pix1[11] - pix2[11]];
262 s += sq[pix1[12] - pix2[12]];
263 s += sq[pix1[13] - pix2[13]];
264 s += sq[pix1[14] - pix2[14]];
265 s += sq[pix1[15] - pix2[15]];
266
267 pix1 += line_size;
268 pix2 += line_size;
269 }
270 return s;
271 }
272
273 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
274 {
275 int i;
276
277 /* read the pixels */
278 for(i=0;i<8;i++) {
279 block[0] = pixels[0];
280 block[1] = pixels[1];
281 block[2] = pixels[2];
282 block[3] = pixels[3];
283 block[4] = pixels[4];
284 block[5] = pixels[5];
285 block[6] = pixels[6];
286 block[7] = pixels[7];
287 pixels += line_size;
288 block += 8;
289 }
290 }
291
292 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
293 const uint8_t *s2, int stride){
294 int i;
295
296 /* read the pixels */
297 for(i=0;i<8;i++) {
298 block[0] = s1[0] - s2[0];
299 block[1] = s1[1] - s2[1];
300 block[2] = s1[2] - s2[2];
301 block[3] = s1[3] - s2[3];
302 block[4] = s1[4] - s2[4];
303 block[5] = s1[5] - s2[5];
304 block[6] = s1[6] - s2[6];
305 block[7] = s1[7] - s2[7];
306 s1 += stride;
307 s2 += stride;
308 block += 8;
309 }
310 }
311
312
313 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
314 int line_size)
315 {
316 int i;
317 uint8_t *cm = cropTbl + MAX_NEG_CROP;
318
319 /* read the pixels */
320 for(i=0;i<8;i++) {
321 pixels[0] = cm[block[0]];
322 pixels[1] = cm[block[1]];
323 pixels[2] = cm[block[2]];
324 pixels[3] = cm[block[3]];
325 pixels[4] = cm[block[4]];
326 pixels[5] = cm[block[5]];
327 pixels[6] = cm[block[6]];
328 pixels[7] = cm[block[7]];
329
330 pixels += line_size;
331 block += 8;
332 }
333 }
334
335 static void put_signed_pixels_clamped_c(const DCTELEM *block,
336 uint8_t *restrict pixels,
337 int line_size)
338 {
339 int i, j;
340
341 for (i = 0; i < 8; i++) {
342 for (j = 0; j < 8; j++) {
343 if (*block < -128)
344 *pixels = 0;
345 else if (*block > 127)
346 *pixels = 255;
347 else
348 *pixels = (uint8_t)(*block + 128);
349 block++;
350 pixels++;
351 }
352 pixels += (line_size - 8);
353 }
354 }
355
356 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
357 int line_size)
358 {
359 int i;
360 uint8_t *cm = cropTbl + MAX_NEG_CROP;
361
362 /* read the pixels */
363 for(i=0;i<8;i++) {
364 pixels[0] = cm[pixels[0] + block[0]];
365 pixels[1] = cm[pixels[1] + block[1]];
366 pixels[2] = cm[pixels[2] + block[2]];
367 pixels[3] = cm[pixels[3] + block[3]];
368 pixels[4] = cm[pixels[4] + block[4]];
369 pixels[5] = cm[pixels[5] + block[5]];
370 pixels[6] = cm[pixels[6] + block[6]];
371 pixels[7] = cm[pixels[7] + block[7]];
372 pixels += line_size;
373 block += 8;
374 }
375 }
#if 0 /* disabled: 64-bit-at-a-time variant of the pixel-op generator below */

/* PIXOP2(OPNAME, OP) generates the whole family of 8-byte-wide copy/average
   primitives (plain, x2/y2/xy2 half-pel, rounding and non-rounding) using
   64-bit packed arithmetic.  Kept for reference; the active 32-bit variant
   follows after the #else. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* packed rounding average over eight bytes at once */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
519
/*
 * PIXOP2(OPNAME, OP) generates the full family of block copy/average
 * primitives for widths 2/4/8/16 using 32-bit packed arithmetic:
 *   - _pixelsN_c:     straight copy/average of an N-wide block
 *   - _pixelsN_l2:    average of two source blocks ("l2" = 2 inputs)
 *   - _pixelsN_l4:    average of four source blocks (for qpel MC)
 *   - _x2/_y2/_xy2:   half-pel interpolation in x, y, or both
 *   - no_rnd_*:       truncating instead of rounding averages
 * OP is either op_put (overwrite) or op_avg (blend with destination).
 * The 0x03/0xFC masks split each byte into low/high bits so four bytes
 * can be averaged in one 32-bit word without cross-byte carries.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD16(&src1[i*src_stride1  ]);\
        b= LD16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i, a0, b0, a1, b1;\
    a0= pixels[0];\
    b0= pixels[1] + 2;\
    a0 += b0;\
    b0 += pixels[2];\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        a1= pixels[0];\
        b1= pixels[1];\
        a1 += b1;\
        b1 += pixels[2];\
\
        block[0]= (a1+a0)>>2; /* FIXME non put */\
        block[1]= (b1+b0)>>2;\
\
        pixels+=line_size;\
        block +=line_size;\
\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        block[0]= (a1+a0)>>2;\
        block[1]= (b1+b0)>>2;\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint32_t a= LD32(pixels  );\
    const uint32_t b= LD32(pixels+1);\
    uint32_t l0=  (a&0x03030303UL)\
                + (b&0x03030303UL)\
                + 0x02020202UL;\
    uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
               + ((b&0xFCFCFCFCUL)>>2);\
    uint32_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint32_t a= LD32(pixels  );\
        uint32_t b= LD32(pixels+1);\
        l1=  (a&0x03030303UL)\
           + (b&0x03030303UL);\
        h1= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD32(pixels  );\
        b= LD32(pixels+1);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

/* rounding average of four packed bytes at once */
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

/* Instantiate the whole pixel-op family twice: avg_* blends with the
   destination, put_* overwrites it. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
894
/* Scalar rounding averages used by the interpolation code below. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
897
/* Truncating (non-rounding) average of two 16-wide sources into dst,
   all three buffers sharing the same stride. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
901
/* Truncating (non-rounding) average of two 8-wide sources into dst,
   all three buffers sharing the same stride. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
905
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* 1/16-pel bilinear interpolation of an 8-wide block, h rows high.
       (x16, y16) are the fractional offsets in sixteenths of a pixel;
       the four corner weights A..D always sum to 256, hence the >>8. */
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int y, x;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (A * src[x]
                    + B * src[x + 1]
                    + C * src[stride + x]
                    + D * src[stride + x + 1]
                    + rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
928
/**
 * Global motion compensation: sample an affine-transformed 8-wide, h-high
 * block from src into dst.
 *
 * (ox, oy) is the source position of the first destination pixel and
 * (dxx, dyx) / (dxy, dyy) are its per-column / per-row increments.  The
 * code drops 16 bits and then uses the low 'shift' bits of the remainder
 * as the bilinear fraction, so positions are fixed point with (16 + shift)
 * fractional bits.  r is the rounding constant added before the final
 * >>(shift*2).  width/height delimit the valid source area; samples
 * falling outside are clamped to the nearest edge pixel.
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;   /* one full pixel in bilinear-fraction units */

    width--;                 /* convert sizes to last valid coordinates */
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);   /* bilinear weights, 0..s-1 */
            frac_y= src_y&(s-1);
            src_x>>=shift;         /* integer source coordinates */
            src_y>>=shift;

            /* unsigned compare also rejects negative coordinates */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 4-tap bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp y, interpolate in x only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp x, interpolate in y only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* outside in both directions: replicate the edge pixel */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
986
/* Thirdpel full-pel case: a plain copy, dispatched to the fixed-width
 * copy routine matching `width`. Unsupported widths are ignored. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
995
/* Thirdpel MC, horizontal offset 1/3: out = (2*a + b) / 3, computed with
 * the fixed-point reciprocal 683/2048 (683 = round(2048/3)). */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int a = src[x];
            const int b = src[x + 1];
            dst[x] = (683 * (2 * a + b + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1006
/* Thirdpel MC, horizontal offset 2/3: out = (a + 2*b) / 3 via 683/2048. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int a = src[x];
            const int b = src[x + 1];
            dst[x] = (683 * (a + 2 * b + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1017
/* Thirdpel MC, vertical offset 1/3: out = (2*top + bottom) / 3 via 683/2048. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int top = src[x];
            const int bot = src[x + stride];
            dst[x] = (683 * (2 * top + bot + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1028
/* Thirdpel MC, offset (1/3, 1/3): bilinear blend with weights 4,3,3,2 (sum 12)
 * scaled by the fixed-point reciprocal 2731/32768 ≈ 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int tl = src[x];
            const int tr = src[x + 1];
            const int bl = src[x + stride];
            const int br = src[x + stride + 1];
            dst[x] = (2731 * (4 * tl + 3 * tr + 3 * bl + 2 * br + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1039
/* Thirdpel MC, offset (1/3, 2/3): bilinear blend with weights 3,2,4,3 (sum 12)
 * scaled by 2731/32768 ≈ 1/12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int tl = src[x];
            const int tr = src[x + 1];
            const int bl = src[x + stride];
            const int br = src[x + stride + 1];
            dst[x] = (2731 * (3 * tl + 2 * tr + 4 * bl + 3 * br + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1050
/* Thirdpel MC, vertical offset 2/3: out = (top + 2*bottom) / 3 via 683/2048. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int top = src[x];
            const int bot = src[x + stride];
            dst[x] = (683 * (top + 2 * bot + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1061
/* Thirdpel MC, offset (2/3, 1/3): bilinear blend with weights 3,4,2,3 (sum 12)
 * scaled by 2731/32768 ≈ 1/12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int tl = src[x];
            const int tr = src[x + 1];
            const int bl = src[x + stride];
            const int br = src[x + stride + 1];
            dst[x] = (2731 * (3 * tl + 4 * tr + 2 * bl + 3 * br + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1072
/* Thirdpel MC, offset (2/3, 2/3): bilinear blend with weights 2,3,3,4 (sum 12)
 * scaled by 2731/32768 ≈ 1/12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int tl = src[x];
            const int tr = src[x + 1];
            const int bl = src[x + stride];
            const int br = src[x + stride + 1];
            dst[x] = (2731 * (2 * tl + 3 * tr + 3 * bl + 4 * br + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1083
/* Thirdpel full-pel averaging case: dispatch to the fixed-width averaging
 * routine matching `width`. Unsupported widths are ignored. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1092
/* Thirdpel MC with averaging, horizontal offset 1/3: interpolate as in the
 * put_ variant, then average with the existing dst value (rounding up). */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (683 * (2 * src[x] + src[x + 1] + 1)) >> 11;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1103
/* Thirdpel MC with averaging, horizontal offset 2/3. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (683 * (src[x] + 2 * src[x + 1] + 1)) >> 11;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1114
/* Thirdpel MC with averaging, vertical offset 1/3. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (683 * (2 * src[x] + src[x + stride] + 1)) >> 11;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1125
/* Thirdpel MC with averaging, offset (1/3, 1/3): weights 4,3,3,2 / 12. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (2731 * (4 * src[x]          + 3 * src[x + 1] +
                                   3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1136
/* Thirdpel MC with averaging, offset (1/3, 2/3): weights 3,2,4,3 / 12. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (2731 * (3 * src[x]          + 2 * src[x + 1] +
                                   4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1147
/* Thirdpel MC with averaging, vertical offset 2/3. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (683 * (src[x] + 2 * src[x + stride] + 1)) >> 11;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1158
/* Thirdpel MC with averaging, offset (2/3, 1/3): weights 3,4,2,3 / 12. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (2731 * (3 * src[x]          + 4 * src[x + 1] +
                                   2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1169
/* Thirdpel MC with averaging, offset (2/3, 2/3): weights 2,3,3,4 / 12. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p = (2731 * (2 * src[x]          + 3 * src[x + 1] +
                                   3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15;
            dst[x] = (dst[x] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* NOTE(review): dead code — this macro is disabled by #if 0 and the bodies
 * are not valid C as written (the stray `void` before each call would not
 * compile if enabled).  Kept for reference only. */
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1201
/* Generate H.264 chroma motion compensation for block widths 2, 4 and 8.
 * (x, y) is the eighth-pel sub-position in [0,8); A..D are the four bilinear
 * corner weights (they sum to 64).  The raw weighted sum is handed to OP,
 * which performs the +32 rounding and >>6 normalization (see op_put/op_avg
 * below).  OPNAME is spliced into the generated function names. */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1264
/* Chroma MC ops: `b` is the raw 6-bit fixed-point weighted sum (weights sum
 * to 64); +32 rounds and >>6 normalizes.  op_avg additionally averages with
 * the existing destination value, rounding up. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

/* Instantiate put_/avg_ h264_chroma_mc{2,4,8}_c. */
H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1272
/* Copy a 4-pixel-wide block of height h, one 32-bit word per row
 * (LD32/ST32 handle potentially unaligned access). */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int y;
    for (y = 0; y < h; y++, dst += dstStride, src += srcStride)
        ST32(dst, LD32(src));
}
1283
/* Copy an 8-pixel-wide block of height h, two 32-bit words per row. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int y;
    for (y = 0; y < h; y++, dst += dstStride, src += srcStride) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
    }
}
1295
/* Copy a 16-pixel-wide block of height h, four 32-bit words per row. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int y, k;
    for (y = 0; y < h; y++, dst += dstStride, src += srcStride) {
        for (k = 0; k < 16; k += 4)
            ST32(dst + k, LD32(src + k));
    }
}
1309
/* Copy a 17-pixel-wide block of height h: four 32-bit words plus one
 * trailing byte per row (17-wide source needed by the qpel filters). */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int y, k;
    for (y = 0; y < h; y++, dst += dstStride, src += srcStride) {
        for (k = 0; k < 16; k += 4)
            ST32(dst + k, LD32(src + k));
        dst[16] = src[16];
    }
}
1324
/* Copy a 9-pixel-wide block of height h: two 32-bit words plus one
 * trailing byte per row (9-wide source needed by the qpel filters). */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int y;
    for (y = 0; y < h; y++, dst += dstStride, src += srcStride) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
    }
}
1337
1338
1339 #define QPEL_MC(r, OPNAME, RND, OP) \
1340 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1341 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1342 int i;\
1343 for(i=0; i<h; i++)\
1344 {\
1345 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1346 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1347 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1348 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1349 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1350 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1351 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1352 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1353 dst+=dstStride;\
1354 src+=srcStride;\
1355 }\
1356 }\
1357 \
1358 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1359 const int w=8;\
1360 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1361 int i;\
1362 for(i=0; i<w; i++)\
1363 {\
1364 const int src0= src[0*srcStride];\
1365 const int src1= src[1*srcStride];\
1366 const int src2= src[2*srcStride];\
1367 const int src3= src[3*srcStride];\
1368 const int src4= src[4*srcStride];\
1369 const int src5= src[5*srcStride];\
1370 const int src6= src[6*srcStride];\
1371 const int src7= src[7*srcStride];\
1372 const int src8= src[8*srcStride];\
1373 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1374 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1375 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1376 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1377 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1378 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1379 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1380 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1381 dst++;\
1382 src++;\
1383 }\
1384 }\
1385 \
1386 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1387 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1388 int i;\
1389 \
1390 for(i=0; i<h; i++)\
1391 {\
1392 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1393 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1394 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1395 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1396 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1397 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1398 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1399 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1400 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1401 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1402 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1403 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1404 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1405 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1406 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1407 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1408 dst+=dstStride;\
1409 src+=srcStride;\
1410 }\
1411 }\
1412 \
1413 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1414 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1415 int i;\
1416 const int w=16;\
1417 for(i=0; i<w; i++)\
1418 {\
1419 const int src0= src[0*srcStride];\
1420 const int src1= src[1*srcStride];\
1421 const int src2= src[2*srcStride];\
1422 const int src3= src[3*srcStride];\
1423 const int src4= src[4*srcStride];\
1424 const int src5= src[5*srcStride];\
1425 const int src6= src[6*srcStride];\
1426 const int src7= src[7*srcStride];\
1427 const int src8= src[8*srcStride];\
1428 const int src9= src[9*srcStride];\
1429 const int src10= src[10*srcStride];\
1430 const int src11= src[11*srcStride];\
1431 const int src12= src[12*srcStride];\
1432 const int src13= src[13*srcStride];\
1433 const int src14= src[14*srcStride];\
1434 const int src15= src[15*srcStride];\
1435 const int src16= src[16*srcStride];\
1436 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1437 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1438 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1439 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1440 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1441 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1442 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1443 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1444 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1445 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1446 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1447 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1448 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1449 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1450 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1451 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1452 dst++;\
1453 src++;\
1454 }\
1455 }\
1456 \
1457 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1458 OPNAME ## pixels8_c(dst, src, stride, 8);\
1459 }\
1460 \
1461 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1462 uint8_t half[64];\
1463 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1464 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1465 }\
1466 \
1467 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1468 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1469 }\
1470 \
1471 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1472 uint8_t half[64];\
1473 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1474 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1475 }\
1476 \
1477 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1478 uint8_t full[16*9];\
1479 uint8_t half[64];\
1480 copy_block9(full, src, 16, stride, 9);\
1481 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1482 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1483 }\
1484 \
1485 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1486 uint8_t full[16*9];\
1487 copy_block9(full, src, 16, stride, 9);\
1488 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1489 }\
1490 \
1491 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1492 uint8_t full[16*9];\
1493 uint8_t half[64];\
1494 copy_block9(full, src, 16, stride, 9);\
1495 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1496 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1497 }\
1498 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1499 uint8_t full[16*9];\
1500 uint8_t halfH[72];\
1501 uint8_t halfV[64];\
1502 uint8_t halfHV[64];\
1503 copy_block9(full, src, 16, stride, 9);\
1504 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1505 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1506 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1507 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1508 }\
1509 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1510 uint8_t full[16*9];\
1511 uint8_t halfH[72];\
1512 uint8_t halfHV[64];\
1513 copy_block9(full, src, 16, stride, 9);\
1514 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1515 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1516 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1517 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1518 }\
1519 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1520 uint8_t full[16*9];\
1521 uint8_t halfH[72];\
1522 uint8_t halfV[64];\
1523 uint8_t halfHV[64];\
1524 copy_block9(full, src, 16, stride, 9);\
1525 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1526 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1527 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1528 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1529 }\
1530 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1531 uint8_t full[16*9];\
1532 uint8_t halfH[72];\
1533 uint8_t halfHV[64];\
1534 copy_block9(full, src, 16, stride, 9);\
1535 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1536 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1537 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1538 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1539 }\
1540 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1541 uint8_t full[16*9];\
1542 uint8_t halfH[72];\
1543 uint8_t halfV[64];\
1544 uint8_t halfHV[64];\
1545 copy_block9(full, src, 16, stride, 9);\
1546 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1547 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1548 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1549 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1550 }\
1551 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1552 uint8_t full[16*9];\
1553 uint8_t halfH[72];\
1554 uint8_t halfHV[64];\
1555 copy_block9(full, src, 16, stride, 9);\
1556 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1557 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1558 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1559 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1560 }\
1561 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1562 uint8_t full[16*9];\
1563 uint8_t halfH[72];\
1564 uint8_t halfV[64];\
1565 uint8_t halfHV[64];\
1566 copy_block9(full, src, 16, stride, 9);\
1567 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1568 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1569 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1570 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1571 }\
1572 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1573 uint8_t full[16*9];\
1574 uint8_t halfH[72];\
1575 uint8_t halfHV[64];\
1576 copy_block9(full, src, 16, stride, 9);\
1577 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1578 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1579 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1580 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1581 }\
1582 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1583 uint8_t halfH[72];\
1584 uint8_t halfHV[64];\
1585 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1586 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1587 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1588 }\
1589 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1590 uint8_t halfH[72];\
1591 uint8_t halfHV[64];\
1592 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1593 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1594 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1595 }\
1596 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1597 uint8_t full[16*9];\
1598 uint8_t halfH[72];\
1599 uint8_t halfV[64];\
1600 uint8_t halfHV[64];\
1601 copy_block9(full, src, 16, stride, 9);\
1602 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1603 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1604 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1605 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1606 }\
1607 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1608 uint8_t full[16*9];\
1609 uint8_t halfH[72];\
1610 copy_block9(full, src, 16, stride, 9);\
1611 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1612 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1613 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1614 }\
1615 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1616 uint8_t full[16*9];\
1617 uint8_t halfH[72];\
1618 uint8_t halfV[64];\
1619 uint8_t halfHV[64];\
1620 copy_block9(full, src, 16, stride, 9);\
1621 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1622 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1623 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1624 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1625 }\
1626 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1627 uint8_t full[16*9];\
1628 uint8_t halfH[72];\
1629 copy_block9(full, src, 16, stride, 9);\
1630 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1631 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1632 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1633 }\
1634 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1635 uint8_t halfH[72];\
1636 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1637 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1638 }\
1639 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1640 OPNAME ## pixels16_c(dst, src, stride, 16);\
1641 }\
1642 \
1643 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1644 uint8_t half[256];\
1645 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1646 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1647 }\
1648 \
1649 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1650 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1651 }\
1652 \
1653 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1654 uint8_t half[256];\
1655 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1656 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1657 }\
1658 \
1659 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1660 uint8_t full[24*17];\
1661 uint8_t half[256];\
1662 copy_block17(full, src, 24, stride, 17);\
1663 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1664 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1665 }\
1666 \
1667 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1668 uint8_t full[24*17];\
1669 copy_block17(full, src, 24, stride, 17);\
1670 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1671 }\
1672 \
1673 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1674 uint8_t full[24*17];\
1675 uint8_t half[256];\
1676 copy_block17(full, src, 24, stride, 17);\
1677 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1678 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1679 }\
1680 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1681 uint8_t full[24*17];\
1682 uint8_t halfH[272];\
1683 uint8_t halfV[256];\
1684 uint8_t halfHV[256];\
1685 copy_block17(full, src, 24, stride, 17);\
1686 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1687 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1688 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1689 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1690 }\
1691 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1692 uint8_t full[24*17];\
1693 uint8_t halfH[272];\
1694 uint8_t halfHV[256];\
1695 copy_block17(full, src, 24, stride, 17);\
1696 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1697 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1698 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1699 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1700 }\
1701 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1702 uint8_t full[24*17];\
1703 uint8_t halfH[272];\
1704 uint8_t halfV[256];\
1705 uint8_t halfHV[256];\
1706 copy_block17(full, src, 24, stride, 17);\
1707 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1708 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1709 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1710 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1711 }\
1712 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1713 uint8_t full[24*17];\
1714 uint8_t halfH[272];\
1715 uint8_t halfHV[256];\
1716 copy_block17(full, src, 24, stride, 17);\
1717 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1718 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1719 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1720 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1721 }\
1722 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1723 uint8_t full[24*17];\
1724 uint8_t halfH[272];\
1725 uint8_t halfV[256];\
1726 uint8_t halfHV[256];\
1727 copy_block17(full, src, 24, stride, 17);\
1728 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1729 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1730 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1731 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1732 }\
1733 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1734 uint8_t full[24*17];\
1735 uint8_t halfH[272];\
1736 uint8_t halfHV[256];\
1737 copy_block17(full, src, 24, stride, 17);\
1738 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1739 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1740 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1741 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1742 }\
1743 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1744 uint8_t full[24*17];\
1745 uint8_t halfH[272];\
1746 uint8_t halfV[256];\
1747 uint8_t halfHV[256];\
1748 copy_block17(full, src, 24, stride, 17);\
1749 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1750 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1751 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1752 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1753 }\
1754 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1755 uint8_t full[24*17];\
1756 uint8_t halfH[272];\
1757 uint8_t halfHV[256];\
1758 copy_block17(full, src, 24, stride, 17);\
1759 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1760 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1761 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1762 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1763 }\
1764 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1765 uint8_t halfH[272];\
1766 uint8_t halfHV[256];\
1767 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1768 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1769 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1770 }\
1771 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1772 uint8_t halfH[272];\
1773 uint8_t halfHV[256];\
1774 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1775 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1776 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1777 }\
1778 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1779 uint8_t full[24*17];\
1780 uint8_t halfH[272];\
1781 uint8_t halfV[256];\
1782 uint8_t halfHV[256];\
1783 copy_block17(full, src, 24, stride, 17);\
1784 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1785 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1786 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1787 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1788 }\
1789 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1790 uint8_t full[24*17];\
1791 uint8_t halfH[272];\
1792 copy_block17(full, src, 24, stride, 17);\
1793 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1794 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1795 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1796 }\
1797 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1798 uint8_t full[24*17];\
1799 uint8_t halfH[272];\
1800 uint8_t halfV[256];\
1801 uint8_t halfHV[256];\
1802 copy_block17(full, src, 24, stride, 17);\
1803 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1804 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1805 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1806 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1807 }\
1808 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1809 uint8_t full[24*17];\
1810 uint8_t halfH[272];\
1811 copy_block17(full, src, 24, stride, 17);\
1812 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1813 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1814 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1815 }\
1816 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1817 uint8_t halfH[272];\
1818 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1819 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1820 }
1821
/* Store operators plugged into QPEL_MC: the filter accumulator b is 32x the
   pixel value, so it is rounded (+16) or truncated (+15, "no_rnd") back to
   8 bits through the crop table cm, then either written directly (put) or
   averaged with the existing destination pixel (avg). */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the MPEG-4 quarter-pel motion-compensation function families. */
QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1835
#if 1
/* H264_LOWPASS(OPNAME, OP, OP2) generates the H.264 6-tap (1,-5,20,20,-5,1)
   half-sample interpolation primitives for 4x4, 8x8 and 16x16 blocks:
     *_h_lowpass  - horizontal filter, result scaled by 32, stored via OP
     *_v_lowpass  - vertical filter, result scaled by 32, stored via OP
     *_hv_lowpass - horizontal pass into the 16-bit tmp buffer, then a
                    vertical pass over tmp; the combined result is scaled
                    by 1024 and stored via OP2
   OP/OP2 are the put/avg store operators defined below, which clip through
   the crop table cm. The 16x16 variants are built from four 8x8 calls. */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/* H264_MC(OPNAME, SIZE) generates the 16 quarter-sample motion-compensation
   functions _mcXY_c for a SIZE x SIZE block (X = horizontal, Y = vertical
   quarter-sample offset). mc00 is a plain copy; the remaining positions
   combine the half-sample _h/_v/_hv lowpass filters with 2-input averaging
   (pixels##SIZE##_l2). "full" holds a copy of the source extended by the
   5 extra rows the vertical 6-tap filter needs (full_mid skips the 2 rows
   above the block); "tmp" is the 16-bit intermediate for the hv filter. */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

/* Store operators for the H.264 filters: OP values are scaled by 32
   (rounded with +16), OP2 (hv path) values by 1024 (rounded with +512);
   both clip through the crop table cm before put or avg. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]

/* Instantiate the put/avg H.264 quarter-pel MC functions for 4/8/16. */
H264_LOWPASS(put_ , op_put, op2_put)
H264_LOWPASS(avg_ , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2197
2198 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2199 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2200 int i;
2201
2202 for(i=0; i<h; i++){
2203 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2204 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2205 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2206 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2207 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2208 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2209 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2210 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2211 dst+=dstStride;
2212 src+=srcStride;
2213 }
2214 }
2215
2216 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2217 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2218 int i;
2219
2220 for(i=0; i<w; i++){
2221 const int src_1= src[ -srcStride];
2222 const int src0 = src[0 ];
2223 const int src1 = src[ srcStride];
2224 const int src2 = src[2*srcStride];
2225 const int src3 = src[3*srcStride];
2226 const int src4 = src[4*srcStride];
2227 const int src5 = src[5*srcStride];
2228 const int src6 = src[6*srcStride];
2229 const int src7 = src[7*srcStride];
2230 const int src8 = src[8*srcStride];
2231 const int src9 = src[9*srcStride];
2232 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2233 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2234 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2235 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2236 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2237 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2238 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2239 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2240 src++;
2241 dst++;
2242 }
2243 }
2244
/* mspel MC, position (0,0): plain 8x8 block copy, no filtering. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2248
/* mspel MC, position (1,0): average of the unfiltered source and the
   horizontally filtered block (half-way between mc00 and mc20). */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];   /* 8x8 horizontally filtered intermediate */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2254
/* mspel MC, position (2,0): horizontal filter written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2258
/* mspel MC, position (3,0): average of the source shifted right by one
   pixel and the horizontally filtered block. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];   /* 8x8 horizontally filtered intermediate */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2264
/* mspel MC, position (0,2): vertical filter written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2268
/* mspel MC, position (1,2): average of the vertically filtered source
   and the separable h-then-v filtered block. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];   /* 8x11 h-filtered block, starting one row above src */
    uint8_t halfV[64];   /* 8x8 v-filtered source */
    uint8_t halfHV[64];  /* 8x8 v-filter applied to halfH (skipping its first row) */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel MC, position (3,2): like mc12 but the vertical-only branch is
   taken from the source shifted right by one pixel. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];   /* 8x11 h-filtered block, starting one row above src */
    uint8_t halfV[64];   /* 8x8 v-filtered source, shifted by +1 pixel */
    uint8_t halfHV[64];  /* 8x8 v-filter applied to halfH (skipping its first row) */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel MC, position (2,2): separable filter — horizontal pass into an
   8x11 intermediate, then a vertical pass (skipping its first row). */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];   /* 8x11 h-filtered intermediate */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2292
2293 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2294 int x;
2295 const int strength= ff_h263_loop_filter_strength[qscale];
2296
2297 for(x=0; x<8; x++){
2298 int d1, d2, ad1;
2299 int p0= src[x-2*stride];
2300 int p1= src[x-1*stride];
2301 int p2= src[x+0*stride];
2302 int p3= src[x+1*stride];
2303 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2304
2305 if (d<-2*strength) d1= 0;
2306 else if(d<- strength) d1=-2*strength - d;
2307 else if(d< strength) d1= d;
2308 else if(d< 2*strength) d1= 2*strength - d;
2309 else d1= 0;
2310
2311 p1 += d1;
2312 p2 -= d1;
2313 if(p1&256) p1= ~(p1>>31);
2314 if(p2&256) p2= ~(p2>>31);
2315
2316 src[x-1*stride] = p1;
2317 src[x+0*stride] = p2;
2318
2319 ad1= ABS(d1)>>1;
2320
2321 d2= clip((p0-p3)/4, -ad1, ad1);
2322
2323 src[x-2*stride] = p0 - d2;
2324 src[x+ stride] = p3 + d2;
2325 }
2326 }
2327
2328 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2329 int y;
2330 const int strength= ff_h263_loop_filter_strength[qscale];
2331
2332 for(y=0; y<8; y++){
2333 int d1, d2, ad1;
2334 int p0= src[y*stride-2];
2335 int p1= src[y*stride-1];
2336 int p2= src[y*stride+0];
2337 int p3= src[y*stride+1];
2338 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2339
2340 if (d<-2*strength) d1= 0;
2341 else if(d<- strength) d1=-2*strength - d;
2342 else if(d< strength) d1= d;
2343 else if(d< 2*strength) d1= 2*strength - d;
2344 else d1= 0;
2345
2346 p1 += d1;
2347 p2 -= d1;
2348 if(p1&256) p1= ~(p1>>31);
2349 if(p2&256) p2= ~(p2>>31);
2350
2351 src[y*stride-1] = p1;
2352 src[y*stride+0] = p2;
2353
2354 ad1= ABS(d1)>>1;
2355
2356 d2= clip((p0-p3)/4, -ad1, ad1);
2357
2358 src[y*stride-2] = p0 - d2;
2359 src[y*stride+1] = p3 + d2;
2360 }
2361 }
2362
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block,
   done in place. The vertical pass fills temp[] with 4x-scaled values
   (edge rows are passed through as 4*src), then the horizontal pass
   writes the rounded result back (edge columns only undo the x4 scale). */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int temp[64];
    int row, col;

    /* vertical pass into temp, values scaled by 4 */
    for(col=0; col<8; col++){
        temp[col     ] = 4*src[col];
        temp[col + 56] = 4*src[col + 7*stride];
    }
    for(row=1; row<7; row++){
        for(col=0; col<8; col++){
            const int s = row*stride + col;
            temp[row*8 + col] = src[s - stride] + 2*src[s] + src[s + stride];
        }
    }

    /* horizontal pass, rounding back to pixel range */
    for(row=0; row<8; row++){
        int *t = temp + row*8;
        uint8_t *line = src + row*stride;
        line[0] = (t[0] + 2)>>2;
        line[7] = (t[7] + 2)>>2;
        for(col=1; col<7; col++){
            line[col] = (t[col-1] + 2*t[col] + t[col+1] + 8)>>4;
        }
    }
}
2389
/* Sum of absolute differences over a 16-wide block of h rows.
   The first argument is unused (matches the pix_abs function-pointer
   signature). */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2417
/* SAD of a 16-wide block against the horizontal half-pel interpolation
   of pix2 (avg2 of each pixel and its right neighbour). */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2445
/* SAD of a 16-wide block against the vertical half-pel interpolation
   of pix2 (avg2 of each pixel and the one directly below it). */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2475
/* SAD of a 16-wide block against the diagonal half-pel interpolation
   of pix2 (avg4 of the 2x2 neighbourhood). */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for(row = 0; row < h; row++) {
        for(col = 0; col < 16; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2505
/**
 * Plain sum of absolute differences (SAD) between an 8-pixel-wide block
 * in pix1 and one in pix2, over h rows.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2525
/**
 * SAD between an 8-pixel-wide block in pix1 and the horizontal half-pel
 * interpolation of pix2 (avg2 of each pixel with its right neighbour).
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2545
/**
 * SAD between an 8-pixel-wide block in pix1 and the vertical half-pel
 * interpolation of pix2 (avg2 of each pixel with the one directly below).
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;
    uint8_t *below = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2567
/**
 * SAD between an 8-pixel-wide block in pix1 and the diagonal half-pel
 * interpolation of pix2 (avg4 of a 2x2 neighbourhood per sample).
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;
    uint8_t *below = pix2 + line_size;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] -
                       avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2589
2590 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2591 int score1=0;
2592 int score2=0;
2593 int x,y;
2594
2595 for(y=0; y<h; y++){
2596 for(x=0; x<16; x++){
2597 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2598 }
2599 if(y+1<h){
2600 for(x=0; x<15; x++){
2601 score2+= ABS( s1[x ] - s1[x +stride]
2602 - s1[x+1] + s1[x+1+stride])
2603 -ABS( s2[x ] - s2[x +stride]
2604 - s2[x+1] + s2[x+1+stride]);
2605 }
2606 }
2607 s1+= stride;
2608 s2+= stride;
2609 }
2610
2611 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2612 else return score1 + ABS(score2)*8;
2613 }
2614
2615 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2616 int score1=0;
2617 int score2=0;
2618 int x,y;
2619
2620 for(y=0; y<h; y++){
2621 for(x=0; x<8; x++){
2622 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2623 }
2624 if(y+1<h){
2625 for(x=0; x<7; x++){
2626 score2+= ABS( s1[x ] - s1[x +stride]
2627 - s1[x+1] + s1[x+1+stride])
2628 -ABS( s2[x ] - s2[x +stride]
2629 - s2[x+1] + s2[x+1+stride]);
2630 }
2631 }
2632 s1+= stride;
2633 s2+= stride;
2634 }
2635
2636 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2637 else return score1 + ABS(score2)*8;
2638 }
2639
/**
 * Estimates the weighted squared error that would result from adding the
 * scaled basis function to the residual block.
 * The fixed-point scaling uses BASIS_SHIFT/RECON_SHIFT (defined in project
 * headers); the intermediate is rounded when dropping precision.
 * @return accumulated (weight*value)^2 sum, downshifted by 6 in total.
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        // basis*scale rounded down from BASIS_SHIFT to RECON_SHIFT precision
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);  // reconstructed value must stay in 10 bits

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
2654
/**
 * Adds a scaled basis function to the residual block:
 * rem[i] += basis[i]*scale, rescaled with rounding from BASIS_SHIFT to
 * RECON_SHIFT fixed-point precision.
 */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
2662
/**
 * Permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permuted to scantable order!
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it's ok but not clean and might fail for some permutations

    /* gather the coefficients reachable in scan order into temp,
       clearing them from block so untouched positions stay zero */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    /* scatter them back at their permuted positions */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}
2691
/**
 * Dummy comparison function for FF_CMP_ZERO: reports a cost of 0 for any
 * pair of blocks, regardless of their contents.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
2695
/**
 * Fills a 5-entry table of comparison functions according to the FF_CMP_*
 * id stored in the low byte of type.
 * The slot layout follows the DSPContext source tables it copies from
 * (e.g. [0] = 16x16 variant, [1] = 8x8 variant).
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);  /* an unknown id leaves NULL entries */

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
2741
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 * Zeroes the six 64-coefficient DCT blocks of one macroblock.
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
2749
/**
 * Elementwise byte addition: dst[i] += src[i] for 0 <= i < w.
 * uint8_t arithmetic wraps modulo 256.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;

    for (i = 0; i < w; i++)
        dst[i] += src[i];
}
2765
/**
 * Elementwise byte difference: dst[i] = src1[i] - src2[i] for 0 <= i < w.
 * The difference is stored modulo 256 (uint8_t wraparound).
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;

    for (i = 0; i < w; i++)
        dst[i] = src1[i] - src2[i];
}
2781
/**
 * Subtracts a median prediction from src2 into dst ("hfyu" suggests this is
 * the HuffYUV median predictor — confirm against callers).
 * The prediction for each sample is mid_pred(left, above, left+above-above_left)
 * — NOTE(review): mid_pred() is defined in common code; assumed median-of-three.
 * On entry *left / *left_top carry state from the previous call; on return
 * they are updated so the next call can continue seamlessly.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];     // previous row's sample becomes the next left-top
        l= src2[i];      // current sample becomes the next left
        dst[i]= l - pred;  // store the prediction residual
    }

    *left= l;
    *left_top= lt;
}
2799
/* One Hadamard butterfly stage writing to named outputs:
 * o1 = i1 + i2, o2 = i1 - i2. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* In-place butterfly: x, y are replaced by x+y, x-y. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Final accumulation stage: |x+y| + |x-y| (butterfly plus absolute sum). */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2814
/**
 * 8x8 SATD: applies an 8x8 Hadamard transform to the difference between
 * src and dst and returns the sum of the absolute transform coefficients.
 * h must be 8.
 *
 * Fix: removed the dead `#if 0` debug block at the end, which declared a
 * function-scope static after executable statements and only printed a
 * running maximum — dead code with no effect on the result.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 8-point Hadamard transform of each difference row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass over each of the 8 columns; the last stage folds the
       absolute values straight into the sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
2866
/**
 * Intra 8x8 SATD: Hadamard transform of the source block itself (dummy is
 * ignored), sum of absolute coefficients, minus the DC term so the block
 * mean does not dominate the score. h must be 8.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 8-point Hadamard transform of each source row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass; the last stage folds the absolute values into sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean (remove the DC contribution)

    return sum;
}
2914
/**
 * DCT-domain SAD for FF_CMP_DCT: forward-transforms the 8x8 difference
 * between src1 and src2 and returns the sum of absolute DCT coefficients.
 * h must be 8.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];  // 8-byte aligned backing store
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);  // forward DCT in place

    for(i=0; i<64; i++)
        sum+= ABS(temp[i]);

    return sum;
}
2931
void simple_idct(DCTELEM *block); //FIXME

/**
 * "Quantization PSNR" comparison for FF_CMP_PSNR: the 8x8 difference block
 * is sent through a quantize/dequantize/IDCT round trip (fast_dct_quantize
 * presumably includes the forward DCT — confirm) and the SSE against the
 * saved original difference is returned. h must be 8.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];  // room for temp and bak
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;  // force the inter quantizer path

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));  // keep the original difference

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
2957
/**
 * Rate-distortion comparison for FF_CMP_RD: quantizes the DCT of the 8x8
 * difference block, estimates the bit cost from the encoder's VLC length
 * tables, reconstructs the block and measures the SSE against src1.
 * @return distortion + ((bits*qscale^2*109 + 64) >> 7)  (a lambda-style
 *         bit-cost weighting).
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    uint64_t __align8 aligned_bak[stride];  // NOTE(review): uint64_t[stride] allocates 8x the bytes an 8-row copy needs — presumably deliberate for alignment; confirm
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distoration, start_i;  // "distoration" [sic]
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* save the 8x8 prediction (src2) so the block can be reconstructed on top of it */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;  // DC coefficient is costed separately below
        length = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* run/level VLC cost for all nonzero coefficients but the last */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;  // bias level into the table's index range
                if((level&(~127)) == 0){  // level within -64..63 -> table lookup
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the final coefficient uses the dedicated "last" VLC table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  // the last coefficient must be nonzero

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* dequantize and reconstruct onto the saved prediction */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
3036
/**
 * Bit-cost comparison for FF_CMP_BIT: quantizes the DCT of the 8x8
 * difference block and returns the estimated number of bits needed to
 * code it (like rd8x8_c but without the distortion term).
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;  // DC coefficient is costed separately below
        length = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* run/level VLC cost for all nonzero coefficients but the last */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;  // bias level into the table's index range
                if((level&(~127)) == 0){  // level within -64..63 -> table lookup
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the final coefficient uses the dedicated "last" VLC table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  // the last coefficient must be nonzero

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
3096
/**
 * Vertical-gradient SAD of a single 16-pixel-wide block (FF_CMP_VSAD,
 * "intra" slot — the second block argument is ignored):
 * sums |s[x][y] - s[x][y+1]| over all 16 columns and h-1 row pairs.
 *
 * Cleanup vs. original: the manually 4-way-unrolled inner loop is written
 * as a plain loop and the file-local ABS macro is replaced by stdlib abs()
 * — identical results.
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++)
            score += abs(s[x] - s[x+stride]);
        s+= stride;
    }

    return score;
}
3111
/**
 * Vertical-gradient SAD between two 16-pixel-wide blocks (FF_CMP_VSAD):
 * for each column and row pair, the absolute difference between the
 * vertical deltas of s1 and s2 is accumulated.
 *
 * Cleanup vs. original: the file-local ABS macro is replaced by stdlib
 * abs() — identical results for these int-range operands.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++)
            score += abs(s1[x] - s2[x] - s1[x+stride] + s2[x+stride]);
        s1+= stride;
        s2+= stride;
    }

    return score;
}
3126
#define SQ(a) ((a)*(a))
/**
 * Vertical-gradient SSE of a single 16-pixel-wide block ("intra" VSSE slot;
 * the second block argument is ignored): sums the squared vertical deltas
 * over all 16 columns and h-1 row pairs.
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s[col] - s[col + stride];
            total += d * d;
        }
        s += stride;
    }

    return total;
}
3142
/**
 * Vertical-gradient SSE between two 16-pixel-wide blocks (FF_CMP_VSSE):
 * for each column and row pair, the squared difference between the
 * vertical deltas of s1 and s2 is accumulated.
 *
 * Cleanup vs. original: the 4-term expression is named in a local instead
 * of being passed to the multi-evaluation SQ() macro — identical results.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            const int d= s1[x] - s2[x] - s1[x+stride] + s2[x+stride];
            score += d*d;
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
3157
/* Generate 16x16 versions of the 8x8 comparison functions by summing the
 * metric over the four 8x8 quadrants (WARPER8_16_SQ is a macro defined
 * elsewhere — presumably dsputil.h; confirm). */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
3164
/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */
/** Reference integer IDCT followed by a clamped store of the result. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/** Reference integer IDCT followed by a clamped add onto the destination. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
3177
3178 /* init static data */
3179 void dsputil_static_init(void)
3180 {
3181 int i;
3182
3183 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3184 for(i=0;i<MAX_NEG_CROP;i++) {
3185 cropTbl[i] = 0;
3186 cropTbl[i + MAX_NEG_CROP + 256] = 255;
3187 }
3188
3189 for(i=0;i<512;i++) {
3190 squareTbl[i] = (i - 256) * (i - 256);
3191 }
3192
3193 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3194 }
3195
3196
/**
 * Initializes the DSPContext function pointers with the C reference
 * implementations, then lets the platform-specific init functions
 * (MMX, ARMv4L, VIS, Alpha, PPC, MMI, SH4, ...) override them with
 * optimized versions, and finally builds the IDCT coefficient
 * permutation table matching the selected IDCT.
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

#ifdef CONFIG_ENCODERS
    /* forward DCT selection (encoders only) */
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT selection; the permutation type tells users how
       coefficients must be reordered for this IDCT */
    if(avctx->idct_algo==FF_IDCT_INT){
        c->idct_put= ff_jref_idct_put;
        c->idct_add= ff_jref_idct_add;
        c->idct = j_rev_dct;
        c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
    }else{ //accurate/default
        c->idct_put= simple_idct_put;
        c->idct_add= simple_idct_add;
        c->idct = simple_idct;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }

    /* VP3 DSP support */
    c->vp3_dsp_init = vp3_dsp_init_c;
    c->vp3_idct = vp3_idct_c;

    /* basic pixel block helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->gmc1 = gmc1_c;
    c->gmc = gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* TODO [0] 16 [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

/* half-pel put/avg function tables; index [.][0..3] = full, x half,
   y half, xy half pel */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    /* third-pel motion compensation (SVQ3-style); index = mx + 4*my */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

/* quarter-pel function tables; index = mx + 4*my */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;

    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

/* comparison-function tables: [0] = 16x16, [1] = 8x8 variant */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;

    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;

    c->h263_h_loop_filter= h263_h_loop_filter_c;
    c->h263_v_loop_filter= h263_v_loop_filter_c;

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

    /* let platform-specific code override the C reference functions */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_SPARC
    dsputil_init_vis(c,avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif
#ifdef ARCH_SH4
    dsputil_init_sh4(c,avctx);
#endif

    /* build the coefficient permutation table matching the selected IDCT */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
3431