* introducing dct248 into the DSP context.
[libav.git] / libavcodec / dsputil.c
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
20 */
21
22 /**
23 * @file dsputil.c
24 * DSP utils
25 */
26
27 #include "avcodec.h"
28 #include "dsputil.h"
29 #include "mpegvideo.h"
30 #include "simple_idct.h"
31 #include "faandct.h"
32
/* Clamp table: cropTbl[MAX_NEG_CROP + v] clips v into 0..255, with
   MAX_NEG_CROP guard entries on each side so moderately out-of-range
   indices stay inside the array.  Filled at init time elsewhere in this
   file (not visible in this chunk). */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* squareTbl[256 + v] is presumably v*v for v in -256..255 — initialized
   elsewhere; confirm against the DSP init code. */
uint32_t squareTbl[512];

/* Standard JPEG/MPEG zigzag scan order for an 8x8 block of coefficients. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
uint16_t __align8 inv_zigzag_direct16[64];

/* Alternate (horizontal) scan pattern, e.g. for interlaced content. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

/* Alternate (vertical) scan pattern. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
   (division-by-multiplication table; entry [b] is ~2^32/b rounded up). */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
132
/**
 * Sum all 256 pixels of a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size stride in bytes between successive rows
 * @return sum of the pixel values (max 16*16*255, fits in int)
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;       /* advance to the next row */
    }
    return total;
}
154
155 static int pix_norm1_c(uint8_t * pix, int line_size)
156 {
157 int s, i, j;
158 uint32_t *sq = squareTbl + 256;
159
160 s = 0;
161 for (i = 0; i < 16; i++) {
162 for (j = 0; j < 16; j += 8) {
163 #if 0
164 s += sq[pix[0]];
165 s += sq[pix[1]];
166 s += sq[pix[2]];
167 s += sq[pix[3]];
168 s += sq[pix[4]];
169 s += sq[pix[5]];
170 s += sq[pix[6]];
171 s += sq[pix[7]];
172 #else
173 #if LONG_MAX > 2147483647
174 register uint64_t x=*(uint64_t*)pix;
175 s += sq[x&0xff];
176 s += sq[(x>>8)&0xff];
177 s += sq[(x>>16)&0xff];
178 s += sq[(x>>24)&0xff];
179 s += sq[(x>>32)&0xff];
180 s += sq[(x>>40)&0xff];
181 s += sq[(x>>48)&0xff];
182 s += sq[(x>>56)&0xff];
183 #else
184 register uint32_t x=*(uint32_t*)pix;
185 s += sq[x&0xff];
186 s += sq[(x>>8)&0xff];
187 s += sq[(x>>16)&0xff];
188 s += sq[(x>>24)&0xff];
189 x=*(uint32_t*)(pix+4);
190 s += sq[x&0xff];
191 s += sq[(x>>8)&0xff];
192 s += sq[(x>>16)&0xff];
193 s += sq[(x>>24)&0xff];
194 #endif
195 #endif
196 pix += 8;
197 }
198 pix += line_size - 16;
199 }
200 return s;
201 }
202
/* Byte-swap w 32-bit words from src into dst (may be the same buffer).
   The main loop is unrolled by eight; the tail loop handles w%8 words. */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int n = 0;

    while (n + 8 <= w) {
        dst[n    ] = bswap_32(src[n    ]);
        dst[n + 1] = bswap_32(src[n + 1]);
        dst[n + 2] = bswap_32(src[n + 2]);
        dst[n + 3] = bswap_32(src[n + 3]);
        dst[n + 4] = bswap_32(src[n + 4]);
        dst[n + 5] = bswap_32(src[n + 5]);
        dst[n + 6] = bswap_32(src[n + 6]);
        dst[n + 7] = bswap_32(src[n + 7]);
        n += 8;
    }
    while (n < w) {
        dst[n] = bswap_32(src[n]);
        n++;
    }
}
220
221 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size)
222 {
223 int s, i;
224 uint32_t *sq = squareTbl + 256;
225
226 s = 0;
227 for (i = 0; i < 8; i++) {
228 s += sq[pix1[0] - pix2[0]];
229 s += sq[pix1[1] - pix2[1]];
230 s += sq[pix1[2] - pix2[2]];
231 s += sq[pix1[3] - pix2[3]];
232 s += sq[pix1[4] - pix2[4]];
233 s += sq[pix1[5] - pix2[5]];
234 s += sq[pix1[6] - pix2[6]];
235 s += sq[pix1[7] - pix2[7]];
236 pix1 += line_size;
237 pix2 += line_size;
238 }
239 return s;
240 }
241
242 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
243 {
244 int s, i;
245 uint32_t *sq = squareTbl + 256;
246
247 s = 0;
248 for (i = 0; i < 16; i++) {
249 s += sq[pix1[ 0] - pix2[ 0]];
250 s += sq[pix1[ 1] - pix2[ 1]];
251 s += sq[pix1[ 2] - pix2[ 2]];
252 s += sq[pix1[ 3] - pix2[ 3]];
253 s += sq[pix1[ 4] - pix2[ 4]];
254 s += sq[pix1[ 5] - pix2[ 5]];
255 s += sq[pix1[ 6] - pix2[ 6]];
256 s += sq[pix1[ 7] - pix2[ 7]];
257 s += sq[pix1[ 8] - pix2[ 8]];
258 s += sq[pix1[ 9] - pix2[ 9]];
259 s += sq[pix1[10] - pix2[10]];
260 s += sq[pix1[11] - pix2[11]];
261 s += sq[pix1[12] - pix2[12]];
262 s += sq[pix1[13] - pix2[13]];
263 s += sq[pix1[14] - pix2[14]];
264 s += sq[pix1[15] - pix2[15]];
265
266 pix1 += line_size;
267 pix2 += line_size;
268 }
269 return s;
270 }
271
272 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
273 {
274 int i;
275
276 /* read the pixels */
277 for(i=0;i<8;i++) {
278 block[0] = pixels[0];
279 block[1] = pixels[1];
280 block[2] = pixels[2];
281 block[3] = pixels[3];
282 block[4] = pixels[4];
283 block[5] = pixels[5];
284 block[6] = pixels[6];
285 block[7] = pixels[7];
286 pixels += line_size;
287 block += 8;
288 }
289 }
290
291 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
292 const uint8_t *s2, int stride){
293 int i;
294
295 /* read the pixels */
296 for(i=0;i<8;i++) {
297 block[0] = s1[0] - s2[0];
298 block[1] = s1[1] - s2[1];
299 block[2] = s1[2] - s2[2];
300 block[3] = s1[3] - s2[3];
301 block[4] = s1[4] - s2[4];
302 block[5] = s1[5] - s2[5];
303 block[6] = s1[6] - s2[6];
304 block[7] = s1[7] - s2[7];
305 s1 += stride;
306 s2 += stride;
307 block += 8;
308 }
309 }
310
311
312 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
313 int line_size)
314 {
315 int i;
316 uint8_t *cm = cropTbl + MAX_NEG_CROP;
317
318 /* read the pixels */
319 for(i=0;i<8;i++) {
320 pixels[0] = cm[block[0]];
321 pixels[1] = cm[block[1]];
322 pixels[2] = cm[block[2]];
323 pixels[3] = cm[block[3]];
324 pixels[4] = cm[block[4]];
325 pixels[5] = cm[block[5]];
326 pixels[6] = cm[block[6]];
327 pixels[7] = cm[block[7]];
328
329 pixels += line_size;
330 block += 8;
331 }
332 }
333
334 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
335 int line_size)
336 {
337 int i;
338 uint8_t *cm = cropTbl + MAX_NEG_CROP;
339
340 /* read the pixels */
341 for(i=0;i<8;i++) {
342 pixels[0] = cm[pixels[0] + block[0]];
343 pixels[1] = cm[pixels[1] + block[1]];
344 pixels[2] = cm[pixels[2] + block[2]];
345 pixels[3] = cm[pixels[3] + block[3]];
346 pixels[4] = cm[pixels[4] + block[4]];
347 pixels[5] = cm[pixels[5] + block[5]];
348 pixels[6] = cm[pixels[6] + block[6]];
349 pixels[7] = cm[pixels[7] + block[7]];
350 pixels += line_size;
351 block += 8;
352 }
353 }
/*
 * PIXOP2(OPNAME, OP) generates the whole family of pixel copy/average
 * motion-compensation primitives for one output operation OP
 * ("put" = store, "avg" = average with destination).  Generated
 * functions cover block widths 2/4/8/16, half-pel interpolation in x
 * (_x2), y (_y2) and both (_xy2), rounding and no-rounding variants,
 * and 2-source/4-source linear blends (_l2/_l4).
 *
 * The bit-twiddling averages work on packed bytes inside one machine
 * word: for words a, b,
 *   rounded avg:    (a|b) - (((a^b) & 0xFEFE...)>>1)
 *   truncated avg:  (a&b) + (((a^b) & 0xFEFE...)>>1)
 * and the _xy2 forms split each byte into low 2 bits (l0/l1) and high
 * 6 bits (h0/h1) so four-way averages can be summed without carries
 * crossing byte lanes.
 *
 * The first PIXOP2 below processes 64 bits per access and is disabled
 * (#if 0); the active variant after #else works 16/32 bits at a time.
 * NOTE(review): the disabled variant's CALL_2X_PIXELS line references
 * OPNAME ## _pixels_c while it defines OPNAME ## _pixels — dead code,
 * left untouched.
 */
#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else /* the 16/32-bit variant below is the one actually compiled */

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD16(&src1[i*src_stride1  ]);\
        b= LD16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i, a0, b0, a1, b1;\
    a0= pixels[0];\
    b0= pixels[1] + 2;\
    a0 += b0;\
    b0 += pixels[2];\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        a1= pixels[0];\
        b1= pixels[1];\
        a1 += b1;\
        b1 += pixels[2];\
\
        block[0]= (a1+a0)>>2; /* FIXME non put */\
        block[1]= (b1+b0)>>2;\
\
        pixels+=line_size;\
        block +=line_size;\
\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        block[0]= (a1+a0)>>2;\
        block[1]= (b1+b0)>>2;\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint32_t a= LD32(pixels  );\
    const uint32_t b= LD32(pixels+1);\
    uint32_t l0=  (a&0x03030303UL)\
                + (b&0x03030303UL)\
                + 0x02020202UL;\
    uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
               + ((b&0xFCFCFCFCUL)>>2);\
    uint32_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint32_t a= LD32(pixels  );\
        uint32_t b= LD32(pixels+1);\
        l1=  (a&0x03030303UL)\
           + (b&0x03030303UL);\
        h1= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD32(pixels  );\
        b= LD32(pixels+1);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

/* Instantiate the whole family twice: "avg" ops and plain "put" ops. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
872
873 #define avg2(a,b) ((a+b+1)>>1)
874 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
875
876
/**
 * 1/16-pel global motion compensation for an 8-pixel-wide block:
 * bilinear interpolation with fractional offsets x16, y16 (0..15 in
 * sixteenths of a pixel).  Weights A..D sum to 256, hence the >>8.
 * src must provide one extra row and column beyond the 8xh area.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int y, x;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++) {
            dst[x] = (A * src[x] +
                      B * src[x + 1] +
                      C * src[stride + x] +
                      D * src[stride + x + 1] + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
899
900 static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
901 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
902 {
903 int y, vx, vy;
904 const int s= 1<<shift;
905
906 width--;
907 height--;
908
909 for(y=0; y<h; y++){
910 int x;
911
912 vx= ox;
913 vy= oy;
914 for(x=0; x<8; x++){ //XXX FIXME optimize
915 int src_x, src_y, frac_x, frac_y, index;
916
917 src_x= vx>>16;
918 src_y= vy>>16;
919 frac_x= src_x&(s-1);
920 frac_y= src_y&(s-1);
921 src_x>>=shift;
922 src_y>>=shift;
923
924 if((unsigned)src_x < width){
925 if((unsigned)src_y < height){
926 index= src_x + src_y*stride;
927 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
928 + src[index +1]* frac_x )*(s-frac_y)
929 + ( src[index+stride ]*(s-frac_x)
930 + src[index+stride+1]* frac_x )* frac_y
931 + r)>>(shift*2);
932 }else{
933 index= src_x + clip(src_y, 0, height)*stride;
934 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
935 + src[index +1]* frac_x )*s
936 + r)>>(shift*2);
937 }
938 }else{
939 if((unsigned)src_y < height){
940 index= clip(src_x, 0, width) + src_y*stride;
941 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
942 + src[index+stride ]* frac_y )*s
943 + r)>>(shift*2);
944 }else{
945 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
946 dst[y*stride + x]= src[index ];
947 }
948 }
949
950 vx+= dxx;
951 vy+= dyx;
952 }
953 ox += dxy;
954 oy += dyy;
955 }
956 }
957
/* Full-pel copy: dispatch to the plain pixel-copy routine for this width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
966
/* Third-pel MC, offset (1/3, 0): dst = round((2*a + b)/3); 683/2048 ~ 1/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (2 * src[x] + src[x + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
977
/* Third-pel MC, offset (2/3, 0): dst = round((a + 2*b)/3); 683/2048 ~ 1/3. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (src[x] + 2 * src[x + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
988
/* Third-pel MC, offset (0, 1/3): vertical blend of the two source rows. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (2 * src[x] + src[x + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
999
/* Third-pel MC, offset (1/3, 1/3): 2x2 taps 4/3/3/2 (sum 12); 2731/32768 ~ 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (4 * src[x] + 3 * src[x + 1]
                            + 3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1010
/* Third-pel MC, offset (1/3, 2/3): 2x2 taps 3/2/4/3 (sum 12); 2731/32768 ~ 1/12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (3 * src[x] + 2 * src[x + 1]
                            + 4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1021
/* Third-pel MC, offset (0, 2/3): vertical blend weighted toward the lower row. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (src[x] + 2 * src[x + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1032
/* Third-pel MC, offset (2/3, 1/3): 2x2 taps 3/4/2/3 (sum 12); 2731/32768 ~ 1/12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (3 * src[x] + 4 * src[x + 1]
                            + 2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1043
/* Third-pel MC, offset (2/3, 2/3): 2x2 taps 2/3/3/4 (sum 12); 2731/32768 ~ 1/12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (2 * src[x] + 3 * src[x + 1]
                            + 3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1054
/* Full-pel average: dispatch to the plain averaging routine for this width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1063
/* As put_tpel_pixels_mc10_c, but average with existing dst (rounding up). */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (2 * src[x] + src[x + 1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1074
/* As put_tpel_pixels_mc20_c, but average with existing dst (rounding up). */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (src[x] + 2 * src[x + 1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1085
/* As put_tpel_pixels_mc01_c, but average with existing dst (rounding up). */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (2 * src[x] + src[x + stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1096
/* As put_tpel_pixels_mc11_c, but average with existing dst (rounding up). */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (4 * src[x] + 3 * src[x + 1]
                                       + 3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1107
/* As put_tpel_pixels_mc12_c, but average with existing dst (rounding up). */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (3 * src[x] + 2 * src[x + 1]
                                       + 4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1118
/* As put_tpel_pixels_mc02_c, but average with existing dst (rounding up). */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (src[x] + 2 * src[x + stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1129
/* As put_tpel_pixels_mc21_c, but average with existing dst (rounding up). */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (3 * src[x] + 4 * src[x + 1]
                                       + 2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1140
/* As put_tpel_pixels_mc22_c, but average with existing dst (rounding up). */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (2 * src[x] + 3 * src[x + 1]
                                       + 3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1151 #if 0
1152 #define TPEL_WIDTH(width)\
1153 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1154 void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1155 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1156 void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1157 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1158 void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1159 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1160 void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1161 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1162 void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1163 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1164 void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1165 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1166 void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1167 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1168 void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1169 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1170 void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1171 #endif
1172
/* Generate the H.264 chroma MC functions for block widths 2, 4 and 8.
 * A..D are the 2x2 bilinear taps for the 1/8-pel offset (x, y); they
 * always sum to 64, and OP applies the (+32)>>6 normalisation. */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++){\
        for(j=0; j<2; j++)\
            OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++){\
        for(j=0; j<4; j++)\
            OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++){\
        for(j=0; j<8; j++)\
            OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
        dst+= stride;\
        src+= stride;\
    }\
}

/* op_put stores the normalised value; op_avg rounds it into the existing dst. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_ , op_put)
H264_CHROMA_MC(avg_ , op_avg)
#undef op_avg
#undef op_put
1243
/* Copy a 4-pixel-wide block of h rows using one 32-bit load/store per row. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for (i = 0; i < h; i++, dst += dstStride, src += srcStride)
        ST32(dst, LD32(src));
}
1254
/* Copy an 8-pixel-wide block of h rows in 32-bit chunks. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i, k;
    for (i = 0; i < h; i++, dst += dstStride, src += srcStride)
        for (k = 0; k < 8; k += 4)
            ST32(dst + k, LD32(src + k));
}
1266
/* Copy a 16-pixel-wide block of h rows in 32-bit chunks. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i, k;
    for (i = 0; i < h; i++, dst += dstStride, src += srcStride)
        for (k = 0; k < 16; k += 4)
            ST32(dst + k, LD32(src + k));
}
1280
/* Copy a 17-pixel-wide block (16 in 32-bit chunks, plus a trailing byte);
   the extra column is the right-edge sample needed by the qpel filters. */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i, k;
    for (i = 0; i < h; i++, dst += dstStride, src += srcStride) {
        for (k = 0; k < 16; k += 4)
            ST32(dst + k, LD32(src + k));
        dst[16] = src[16];
    }
}
1295
/* Copy a 9-pixel-wide block (8 in 32-bit chunks, plus a trailing byte);
   the extra column is the right-edge sample needed by the qpel filters. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i, k;
    for (i = 0; i < h; i++, dst += dstStride, src += srcStride) {
        for (k = 0; k < 8; k += 4)
            ST32(dst + k, LD32(src + k));
        dst[8] = src[8];
    }
}
1308
1309
1310 #define QPEL_MC(r, OPNAME, RND, OP) \
1311 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1312 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1313 int i;\
1314 for(i=0; i<h; i++)\
1315 {\
1316 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1317 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1318 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1319 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1320 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1321 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1322 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1323 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1324 dst+=dstStride;\
1325 src+=srcStride;\
1326 }\
1327 }\
1328 \
1329 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1330 const int w=8;\
1331 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1332 int i;\
1333 for(i=0; i<w; i++)\
1334 {\
1335 const int src0= src[0*srcStride];\
1336 const int src1= src[1*srcStride];\
1337 const int src2= src[2*srcStride];\
1338 const int src3= src[3*srcStride];\
1339 const int src4= src[4*srcStride];\
1340 const int src5= src[5*srcStride];\
1341 const int src6= src[6*srcStride];\
1342 const int src7= src[7*srcStride];\
1343 const int src8= src[8*srcStride];\
1344 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1345 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1346 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1347 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1348 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1349 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1350 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1351 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1352 dst++;\
1353 src++;\
1354 }\
1355 }\
1356 \
1357 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1358 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1359 int i;\
1360 \
1361 for(i=0; i<h; i++)\
1362 {\
1363 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1364 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1365 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1366 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1367 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1368 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1369 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1370 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1371 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1372 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1373 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1374 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1375 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1376 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1377 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1378 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1379 dst+=dstStride;\
1380 src+=srcStride;\
1381 }\
1382 }\
1383 \
1384 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1385 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1386 int i;\
1387 const int w=16;\
1388 for(i=0; i<w; i++)\
1389 {\
1390 const int src0= src[0*srcStride];\
1391 const int src1= src[1*srcStride];\
1392 const int src2= src[2*srcStride];\
1393 const int src3= src[3*srcStride];\
1394 const int src4= src[4*srcStride];\
1395 const int src5= src[5*srcStride];\
1396 const int src6= src[6*srcStride];\
1397 const int src7= src[7*srcStride];\
1398 const int src8= src[8*srcStride];\
1399 const int src9= src[9*srcStride];\
1400 const int src10= src[10*srcStride];\
1401 const int src11= src[11*srcStride];\
1402 const int src12= src[12*srcStride];\
1403 const int src13= src[13*srcStride];\
1404 const int src14= src[14*srcStride];\
1405 const int src15= src[15*srcStride];\
1406 const int src16= src[16*srcStride];\
1407 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1408 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1409 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1410 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1411 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1412 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1413 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1414 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1415 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1416 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1417 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1418 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1419 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1420 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1421 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1422 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1423 dst++;\
1424 src++;\
1425 }\
1426 }\
1427 \
1428 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1429 OPNAME ## pixels8_c(dst, src, stride, 8);\
1430 }\
1431 \
1432 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1433 uint8_t half[64];\
1434 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1435 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1436 }\
1437 \
1438 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1439 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1440 }\
1441 \
1442 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1443 uint8_t half[64];\
1444 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1445 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1446 }\
1447 \
1448 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1449 uint8_t full[16*9];\
1450 uint8_t half[64];\
1451 copy_block9(full, src, 16, stride, 9);\
1452 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1453 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1454 }\
1455 \
1456 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1457 uint8_t full[16*9];\
1458 copy_block9(full, src, 16, stride, 9);\
1459 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1460 }\
1461 \
1462 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1463 uint8_t full[16*9];\
1464 uint8_t half[64];\
1465 copy_block9(full, src, 16, stride, 9);\
1466 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1467 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1468 }\
1469 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1470 uint8_t full[16*9];\
1471 uint8_t halfH[72];\
1472 uint8_t halfV[64];\
1473 uint8_t halfHV[64];\
1474 copy_block9(full, src, 16, stride, 9);\
1475 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1476 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1477 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1478 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1479 }\
1480 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1481 uint8_t full[16*9];\
1482 uint8_t halfH[72];\
1483 uint8_t halfHV[64];\
1484 copy_block9(full, src, 16, stride, 9);\
1485 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1486 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1487 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1488 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1489 }\
1490 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1491 uint8_t full[16*9];\
1492 uint8_t halfH[72];\
1493 uint8_t halfV[64];\
1494 uint8_t halfHV[64];\
1495 copy_block9(full, src, 16, stride, 9);\
1496 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1497 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1498 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1499 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1500 }\
1501 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1502 uint8_t full[16*9];\
1503 uint8_t halfH[72];\
1504 uint8_t halfHV[64];\
1505 copy_block9(full, src, 16, stride, 9);\
1506 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1507 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1508 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1509 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1510 }\
1511 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1512 uint8_t full[16*9];\
1513 uint8_t halfH[72];\
1514 uint8_t halfV[64];\
1515 uint8_t halfHV[64];\
1516 copy_block9(full, src, 16, stride, 9);\
1517 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1518 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1519 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1520 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1521 }\
1522 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1523 uint8_t full[16*9];\
1524 uint8_t halfH[72];\
1525 uint8_t halfHV[64];\
1526 copy_block9(full, src, 16, stride, 9);\
1527 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1528 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1529 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1530 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1531 }\
1532 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1533 uint8_t full[16*9];\
1534 uint8_t halfH[72];\
1535 uint8_t halfV[64];\
1536 uint8_t halfHV[64];\
1537 copy_block9(full, src, 16, stride, 9);\
1538 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1539 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1540 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1541 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1542 }\
1543 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1544 uint8_t full[16*9];\
1545 uint8_t halfH[72];\
1546 uint8_t halfHV[64];\
1547 copy_block9(full, src, 16, stride, 9);\
1548 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1549 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1550 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1551 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1552 }\
1553 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1554 uint8_t halfH[72];\
1555 uint8_t halfHV[64];\
1556 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1557 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1558 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1559 }\
1560 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1561 uint8_t halfH[72];\
1562 uint8_t halfHV[64];\
1563 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1564 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1565 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1566 }\
1567 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1568 uint8_t full[16*9];\
1569 uint8_t halfH[72];\
1570 uint8_t halfV[64];\
1571 uint8_t halfHV[64];\
1572 copy_block9(full, src, 16, stride, 9);\
1573 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1574 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1575 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1576 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1577 }\
1578 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1579 uint8_t full[16*9];\
1580 uint8_t halfH[72];\
1581 copy_block9(full, src, 16, stride, 9);\
1582 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1583 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1584 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1585 }\
1586 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1587 uint8_t full[16*9];\
1588 uint8_t halfH[72];\
1589 uint8_t halfV[64];\
1590 uint8_t halfHV[64];\
1591 copy_block9(full, src, 16, stride, 9);\
1592 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1593 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1594 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1595 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1596 }\
1597 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1598 uint8_t full[16*9];\
1599 uint8_t halfH[72];\
1600 copy_block9(full, src, 16, stride, 9);\
1601 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1602 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1603 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1604 }\
1605 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1606 uint8_t halfH[72];\
1607 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1608 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1609 }\
1610 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1611 OPNAME ## pixels16_c(dst, src, stride, 16);\
1612 }\
1613 \
1614 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1615 uint8_t half[256];\
1616 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1617 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1618 }\
1619 \
1620 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1621 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1622 }\
1623 \
1624 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1625 uint8_t half[256];\
1626 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1627 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1628 }\
1629 \
1630 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1631 uint8_t full[24*17];\
1632 uint8_t half[256];\
1633 copy_block17(full, src, 24, stride, 17);\
1634 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1635 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1636 }\
1637 \
1638 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1639 uint8_t full[24*17];\
1640 copy_block17(full, src, 24, stride, 17);\
1641 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1642 }\
1643 \
1644 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1645 uint8_t full[24*17];\
1646 uint8_t half[256];\
1647 copy_block17(full, src, 24, stride, 17);\
1648 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1649 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1650 }\
1651 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1652 uint8_t full[24*17];\
1653 uint8_t halfH[272];\
1654 uint8_t halfV[256];\
1655 uint8_t halfHV[256];\
1656 copy_block17(full, src, 24, stride, 17);\
1657 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1658 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1659 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1660 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1661 }\
1662 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1663 uint8_t full[24*17];\
1664 uint8_t halfH[272];\
1665 uint8_t halfHV[256];\
1666 copy_block17(full, src, 24, stride, 17);\
1667 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1668 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1669 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1670 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1671 }\
1672 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1673 uint8_t full[24*17];\
1674 uint8_t halfH[272];\
1675 uint8_t halfV[256];\
1676 uint8_t halfHV[256];\
1677 copy_block17(full, src, 24, stride, 17);\
1678 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1679 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1680 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1681 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1682 }\
1683 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1684 uint8_t full[24*17];\
1685 uint8_t halfH[272];\
1686 uint8_t halfHV[256];\
1687 copy_block17(full, src, 24, stride, 17);\
1688 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1689 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1690 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1691 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1692 }\
1693 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1694 uint8_t full[24*17];\
1695 uint8_t halfH[272];\
1696 uint8_t halfV[256];\
1697 uint8_t halfHV[256];\
1698 copy_block17(full, src, 24, stride, 17);\
1699 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1700 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1701 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1702 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1703 }\
1704 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1705 uint8_t full[24*17];\
1706 uint8_t halfH[272];\
1707 uint8_t halfHV[256];\
1708 copy_block17(full, src, 24, stride, 17);\
1709 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1710 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1711 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1712 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1713 }\
1714 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1715 uint8_t full[24*17];\
1716 uint8_t halfH[272];\
1717 uint8_t halfV[256];\
1718 uint8_t halfHV[256];\
1719 copy_block17(full, src, 24, stride, 17);\
1720 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1721 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1722 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1723 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1724 }\
1725 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1726 uint8_t full[24*17];\
1727 uint8_t halfH[272];\
1728 uint8_t halfHV[256];\
1729 copy_block17(full, src, 24, stride, 17);\
1730 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1731 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1732 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1733 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1734 }\
1735 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1736 uint8_t halfH[272];\
1737 uint8_t halfHV[256];\
1738 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1739 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1740 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1741 }\
1742 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1743 uint8_t halfH[272];\
1744 uint8_t halfHV[256];\
1745 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1746 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1747 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1748 }\
1749 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1750 uint8_t full[24*17];\
1751 uint8_t halfH[272];\
1752 uint8_t halfV[256];\
1753 uint8_t halfHV[256];\
1754 copy_block17(full, src, 24, stride, 17);\
1755 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1756 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1757 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1758 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1759 }\
1760 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1761 uint8_t full[24*17];\
1762 uint8_t halfH[272];\
1763 copy_block17(full, src, 24, stride, 17);\
1764 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1765 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1766 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1767 }\
1768 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1769 uint8_t full[24*17];\
1770 uint8_t halfH[272];\
1771 uint8_t halfV[256];\
1772 uint8_t halfHV[256];\
1773 copy_block17(full, src, 24, stride, 17);\
1774 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1775 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1776 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1777 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1778 }\
1779 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1780 uint8_t full[24*17];\
1781 uint8_t halfH[272];\
1782 copy_block17(full, src, 24, stride, 17);\
1783 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1784 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1785 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1786 }\
1787 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1788 uint8_t halfH[272];\
1789 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1790 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1791 }
1792
/* Store ops plugged into QPEL_MC: the 6-tap filter output b is rescaled by
   1/32 through the crop table (cm); "op_put" stores, "op_avg" averages with
   the existing dst pixel. The _no_rnd_ variants use +15 instead of +16 so
   the division truncates (no-rounding mode required by MPEG-4). */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the full put/put_no_rnd/avg qpel MC function families. */
QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1806
1807 #if 1
/**
 * H264_LOWPASS(OPNAME, OP, OP2) expands to the 6-tap (1,-5,20,20,-5,1)
 * half-pel interpolation filters used by H.264 quarter-pel motion
 * compensation, for 4x4, 8x8 and 16x16 blocks:
 *   - *_h_lowpass:  horizontal pass, result stored via OP (1/32 rescale)
 *   - *_v_lowpass:  vertical pass, result stored via OP
 *   - *_hv_lowpass: horizontal pass into a 16bit tmp buffer (h+5 rows,
 *                   no clipping), then a vertical pass over tmp stored via
 *                   OP2 (1/1024 rescale for the two cascaded filters)
 * The 16x16 versions are composed of four 8x8 calls. OP/OP2 either store
 * (put) or average with dst (avg); clipping goes through cropTbl.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\
2010
/**
 * H264_MC(OPNAME, SIZE) expands to the 16 quarter-pel motion compensation
 * entry points OPNAME ## h264_qpel ## SIZE ## _mcXY_c, where X/Y are the
 * quarter-pel phases (0..3) in x and y:
 *   - mc00: plain pixel copy
 *   - mc20/mc02: pure horizontal/vertical half-pel (lowpass only)
 *   - mc10/mc30, mc01/mc03: average of full-pel and half-pel values
 *   - mc22: the 2D hv lowpass; mc21/mc23/mc12/mc32: average of a 1D
 *     half-pel and the hv result
 *   - mc11/mc31/mc13/mc33: average of horizontal and vertical half-pels
 * copy_block ## SIZE pads SIZE+5 rows into "full" so the vertical filter
 * can read 2 rows above and 3 below; full_mid skips the 2 top pad rows.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
2147
/* Store ops for the H.264 lowpass filters: op_put/op_avg rescale the single
   6-tap pass by 1/32 (rounded) through the crop table; op2_put/op2_avg
   rescale the cascaded hv pass by 1/1024. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]

/* Instantiate put/avg variants of the lowpass filters and the 4/8/16
   quarter-pel MC entry points. */
H264_LOWPASS(put_ , op_put, op2_put)
H264_LOWPASS(avg_ , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
2167 #endif
2168
2169 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2170 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2171 int i;
2172
2173 for(i=0; i<h; i++){
2174 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2175 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2176 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2177 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2178 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2179 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2180 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2181 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2182 dst+=dstStride;
2183 src+=srcStride;
2184 }
2185 }
2186
2187 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2188 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2189 int i;
2190
2191 for(i=0; i<w; i++){
2192 const int src_1= src[ -srcStride];
2193 const int src0 = src[0 ];
2194 const int src1 = src[ srcStride];
2195 const int src2 = src[2*srcStride];
2196 const int src3 = src[3*srcStride];
2197 const int src4 = src[4*srcStride];
2198 const int src5 = src[5*srcStride];
2199 const int src6 = src[6*srcStride];
2200 const int src7 = src[7*srcStride];
2201 const int src8 = src[8*srcStride];
2202 const int src9 = src[9*srcStride];
2203 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2204 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2205 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2206 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2207 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2208 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2209 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2210 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2211 src++;
2212 dst++;
2213 }
2214 }
2215
/* mspel position (0,0): plain 8x8 copy, no filtering. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2219
/* mspel position (1,0): average of the full-pel pixels and the horizontal
   half-pel values. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2225
/* mspel position (2,0): horizontal half-pel filter straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2229
/* mspel position (3,0): average of the right neighbour (src+1) and the
   horizontal half-pel values. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2235
/* mspel position (0,2): vertical half-pel filter straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2239
/* mspel position (1,2): horizontally filter an 11-row strip starting one
   row above src (halfH), vertically filter both the source (halfV) and the
   strip skipping its top row (halfHV), then average the two. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel position (3,2): like mc12 but the pure vertical pass runs on the
   right neighbour column (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel position (2,2): horizontal pass over an 11-row strip, then a
   vertical pass over it (skipping the extra top row) into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2263
2264
/* Sum of absolute differences between two 16x16 blocks, both laid out with
   the given line_size (stride in bytes). */
static inline int pix_abs16x16_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int row, x;

    for (row = 0; row < 16; row++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2292
/* SAD of a 16x16 block against the horizontal half-pel interpolation of
   pix2 (rounded average of each pixel and its right neighbour). */
static int pix_abs16x16_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int total = 0;
    int row, x;

    for (row = 0; row < 16; row++) {
        for (x = 0; x < 16; x++)
            total += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2320
/* SAD of a 16x16 block against the vertical half-pel interpolation of
   pix2 (rounded average of each pixel and the one directly below it). */
static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, x;

    for (row = 0; row < 16; row++) {
        for (x = 0; x < 16; x++)
            total += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2350
/* SAD of a 16x16 block against the 2D half-pel interpolation of pix2
   (rounded average of the 2x2 neighbourhood). */
static int pix_abs16x16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, x;

    for (row = 0; row < 16; row++) {
        for (x = 0; x < 16; x++)
            total += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2380
/* Sum of absolute differences between two 8x8 blocks. */
static inline int pix_abs8x8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int row, x;

    for (row = 0; row < 8; row++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2400
/* SAD of an 8x8 block against the horizontal half-pel interpolation of pix2. */
static int pix_abs8x8_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int total = 0;
    int row, x;

    for (row = 0; row < 8; row++) {
        for (x = 0; x < 8; x++)
            total += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2420
/* SAD of an 8x8 block against the vertical half-pel interpolation of pix2. */
static int pix_abs8x8_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, x;

    for (row = 0; row < 8; row++) {
        for (x = 0; x < 8; x++)
            total += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2442
/* SAD of an 8x8 block against the 2D half-pel interpolation of pix2. */
static int pix_abs8x8_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, x;

    for (row = 0; row < 8; row++) {
        for (x = 0; x < 8; x++)
            total += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2464
/* SAD wrapper adapting pix_abs16x16_c to the me_cmp-style signature
   (leading context pointer is unused). */
static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs16x16_c(a,b,stride);
}
2468
/* SAD wrapper adapting pix_abs8x8_c to the me_cmp-style signature
   (leading context pointer is unused). */
static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs8x8_c(a,b,stride);
}
2472
2473 /**
2474 * permutes an 8x8 block.
2475 * @param block the block which will be permuted according to the given permutation vector
2476 * @param permutation the permutation vector
2477 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2478 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2479 * (inverse) permutated to scantable order!
2480 */
2481 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2482 {
2483 int i;
2484 DCTELEM temp[64];
2485
2486 if(last<=0) return;
2487 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2488
2489 for(i=0; i<=last; i++){
2490 const int j= scantable[i];
2491 temp[j]= block[j];
2492 block[j]=0;
2493 }
2494
2495 for(i=0; i<=last; i++){
2496 const int j= scantable[i];
2497 const int perm_j= permutation[j];
2498 block[perm_j]= temp[j];
2499 }
2500 }
2501
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 * Zeroes the six 64-coefficient blocks of one macroblock.
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
2509
/* dst[i] += src[i] for 0 <= i < w (byte-wise, wraps modulo 256). */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i = 0, j;

    /* main loop: 8 bytes per iteration */
    for (; i + 7 < w; i += 8)
        for (j = 0; j < 8; j++)
            dst[i + j] += src[i + j];

    /* remaining 0..7 bytes */
    while (i < w) {
        dst[i] += src[i];
        i++;
    }
}
2525
/* dst[i] = src1[i] - src2[i] for 0 <= i < w (byte-wise, wraps modulo 256). */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i = 0, j;

    /* main loop: 8 bytes per iteration */
    for (; i + 7 < w; i += 8)
        for (j = 0; j < 8; j++)
            dst[i + j] = src1[i + j] - src2[i + j];

    /* remaining 0..7 bytes */
    while (i < w) {
        dst[i] = src1[i] - src2[i];
        i++;
    }
}
2541
/**
 * HuffYUV-style median prediction subtraction.
 * For each position, predicts the current sample from the running left
 * sample, the sample above (src1[i]) and the gradient left+above-topleft
 * via mid_pred() (defined elsewhere — presumably median of three; confirm
 * in dsputil.h), and writes the prediction residual to dst.
 * *left and *left_top carry the predictor state across calls and are
 * updated on return.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        /* gradient term masked to 8 bits before the median */
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        /* update state BEFORE writing: lt becomes the old "above",
           l becomes the current source sample */
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred; /* residual, modular 8-bit */
    }

    *left= l;
    *left_top= lt;
}
2559
/* two-output butterfly: o1 = i1 + i2, o2 = i1 - i2 */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* in-place butterfly: x,y = x+y, x-y
   NOTE: x and y are evaluated more than once — pass simple lvalues only */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* final butterfly stage fused with absolute-value accumulation */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2574
/**
 * 8x8 Hadamard difference (SATD-style metric): applies a 2D 8-point
 * Walsh-Hadamard transform to the pixel differences src-dst and returns
 * the sum of absolute transform coefficients.
 */
static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
    int i;
    int temp[64];
    int sum=0;

    /* horizontal pass: butterfly network over each row of differences */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass on each column; the last butterfly stage is fused
       with the absolute-value accumulation via BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
/* dead debug code (would not even compile here: declaration after statements) */
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
2624
/**
 * 8x8 Hadamard "energy" of a single block: subtracts the given mean from
 * every sample, applies a 2D 8-point Walsh-Hadamard transform and returns
 * the sum of absolute transform coefficients.
 */
static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
    int i;
    int temp[64];
    int sum=0;
//FIXME OOOPS ignore 0 term instead of mean mess
    /* horizontal pass: butterfly network over each mean-removed row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-mean,src[stride*i+1]-mean);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-mean,src[stride*i+3]-mean);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-mean,src[stride*i+5]-mean);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-mean,src[stride*i+7]-mean);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass on each column; final stage fused with the
       absolute-value accumulation via BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    return sum;
}
2668
2669 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2670 MpegEncContext * const s= (MpegEncContext *)c;
2671 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2672 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2673 int sum=0, i;
2674
2675 s->dsp.diff_pixels(temp, src1, src2, stride);
2676 s->dsp.fdct(temp);
2677
2678 for(i=0; i<64; i++)
2679 sum+= ABS(temp[i]);
2680
2681 return sum;
2682 }
2683
2684 void simple_idct(DCTELEM *block); //FIXME
2685
/**
 * Quantization-noise metric: runs the 8x8 difference block through
 * quantize -> dequantize -> IDCT and returns the summed squared error
 * against the untouched difference block.
 * NOTE(review): assumes fast_dct_quantize performs the forward DCT
 * internally (temp is compared post-IDCT against the pre-DCT copy) —
 * confirm against the quantizer implementation.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8]; /* temp + bak, 8-byte aligned */
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    s->mb_intra=0; /* force inter quantization */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* keep an untouched copy of the difference block */
    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME

    /* squared reconstruction error */
    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
2708
/**
 * Rate-distortion metric for an 8x8 block: quantizes the difference
 * src1-src2, counts the VLC bits needed to code the run/level pairs,
 * reconstructs the block and returns distortion + lambda-weighted bits.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    /* VLA: stride uint64_t == 8*stride bytes == exactly the 8 rows of
       stride bytes addressed through bak below */
    uint64_t __align8 aligned_bak[stride];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distoration, start_i; /* sic: "distoration" */
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    /* copy the 8x8 reference area (two 32-bit words per row) so we can
       reconstruct into it without touching src2 */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* intra blocks code the DC separately and start AC coding at 1 */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* count run/level VLC bits; the final coefficient uses the
           "last" tables */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64; /* bias level into the 0..127 table range */
                if((level&(~127)) == 0){ /* level in [-64,63] -> table hit */
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length; /* out of range -> escape code */
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64); /* last coefficient must be nonzero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* reconstruct: dequantize and add the IDCT back onto the copy */
    if(last>=0){
        s->dct_unquantize(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distoration= s->dsp.sse[1](NULL, bak, src1, stride);

    /* distortion + lambda*bits; 109/128 * qscale^2 is the rate weight */
    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2782
/**
 * Rate metric for an 8x8 block: quantizes the difference src1-src2 and
 * returns only the number of VLC bits needed to code the run/level
 * pairs (no distortion term, unlike rd8x8_c).
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* intra blocks code the DC separately and start AC coding at 1 */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* count run/level VLC bits; the final coefficient uses the
           "last" tables */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64; /* bias level into the 0..127 table range */
                if((level&(~127)) == 0){ /* level in [-64,63] -> table hit */
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length; /* out of range -> escape code */
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64); /* last coefficient must be nonzero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2840
2841
/* Build 16x16 comparison functions from the 8x8 ones. NOTE(review):
   WARPER88_1616 is defined elsewhere — presumably it applies the 8x8
   function to the four quadrants and sums; confirm in dsputil.h. */
WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
WARPER88_1616(rd8x8_c, rd16x16_c)
WARPER88_1616(bit8x8_c, bit16x16_c)
2847
/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */
/* JPEG reference IDCT in place, then clamp to [0,255] and store. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* JPEG reference IDCT in place, then add to dest with clamping. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
2860
2861 /* init static data */
2862 void dsputil_static_init(void)
2863 {
2864 int i;
2865
2866 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
2867 for(i=0;i<MAX_NEG_CROP;i++) {
2868 cropTbl[i] = 0;
2869 cropTbl[i + MAX_NEG_CROP + 256] = 255;
2870 }
2871
2872 for(i=0;i<512;i++) {
2873 squareTbl[i] = (i - 256) * (i - 256);
2874 }
2875
2876 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2877 }
2878
2879
2880 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2881 {
2882 int i;
2883
2884 #ifdef CONFIG_ENCODERS
2885 if(avctx->dct_algo==FF_DCT_FASTINT) {
2886 c->fdct = fdct_ifast;
2887 c->fdct248 = ff_fdct248_islow; // FIXME: need an optimized version
2888 }
2889 else if(avctx->dct_algo==FF_DCT_FAAN) {
2890 c->fdct = ff_faandct;
2891 c->fdct248 = ff_fdct248_islow; // FIXME: need an optimized version
2892 }
2893 else {
2894 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2895 c->fdct248 = ff_fdct248_islow;
2896 }
2897 #endif //CONFIG_ENCODERS
2898
2899 if(avctx->idct_algo==FF_IDCT_INT){
2900 c->idct_put= ff_jref_idct_put;
2901 c->idct_add= ff_jref_idct_add;
2902 c->idct = j_rev_dct;
2903 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2904 }else{ //accurate/default
2905 c->idct_put= simple_idct_put;
2906 c->idct_add= simple_idct_add;
2907 c->idct = simple_idct;
2908 c->idct_permutation_type= FF_NO_IDCT_PERM;
2909 }
2910
2911 c->get_pixels = get_pixels_c;
2912 c->diff_pixels = diff_pixels_c;
2913 c->put_pixels_clamped = put_pixels_clamped_c;
2914 c->add_pixels_clamped = add_pixels_clamped_c;
2915 c->gmc1 = gmc1_c;
2916 c->gmc = gmc_c;
2917 c->clear_blocks = clear_blocks_c;
2918 c->pix_sum = pix_sum_c;
2919 c->pix_norm1 = pix_norm1_c;
2920 c->sse[0]= sse16_c;
2921 c->sse[1]= sse8_c;
2922
2923 /* TODO [0] 16 [1] 8 */
2924 c->pix_abs16x16 = pix_abs16x16_c;
2925 c->pix_abs16x16_x2 = pix_abs16x16_x2_c;
2926 c->pix_abs16x16_y2 = pix_abs16x16_y2_c;
2927 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
2928 c->pix_abs8x8 = pix_abs8x8_c;
2929 c->pix_abs8x8_x2 = pix_abs8x8_x2_c;
2930 c->pix_abs8x8_y2 = pix_abs8x8_y2_c;
2931 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
2932
2933 #define dspfunc(PFX, IDX, NUM) \
2934 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
2935 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
2936 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
2937 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
2938
2939 dspfunc(put, 0, 16);
2940 dspfunc(put_no_rnd, 0, 16);
2941 dspfunc(put, 1, 8);
2942 dspfunc(put_no_rnd, 1, 8);
2943 dspfunc(put, 2, 4);
2944 dspfunc(put, 3, 2);
2945
2946 dspfunc(avg, 0, 16);
2947 dspfunc(avg_no_rnd, 0, 16);
2948 dspfunc(avg, 1, 8);
2949 dspfunc(avg_no_rnd, 1, 8);
2950 dspfunc(avg, 2, 4);
2951 dspfunc(avg, 3, 2);
2952 #undef dspfunc
2953
2954 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2955 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2956 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2957 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2958 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2959 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2960 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2961 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2962 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2963
2964 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2965 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2966 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2967 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2968 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2969 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2970 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2971 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2972 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2973
2974 #define dspfunc(PFX, IDX, NUM) \
2975 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2976 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2977 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2978 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2979 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2980 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2981 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2982 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2983 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2984 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2985 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2986 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2987 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2988 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2989 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2990 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2991
2992 dspfunc(put_qpel, 0, 16);
2993 dspfunc(put_no_rnd_qpel, 0, 16);
2994
2995 dspfunc(avg_qpel, 0, 16);
2996 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2997
2998 dspfunc(put_qpel, 1, 8);
2999 dspfunc(put_no_rnd_qpel, 1, 8);
3000
3001 dspfunc(avg_qpel, 1, 8);
3002 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3003
3004 dspfunc(put_h264_qpel, 0, 16);
3005 dspfunc(put_h264_qpel, 1, 8);
3006 dspfunc(put_h264_qpel, 2, 4);
3007 dspfunc(avg_h264_qpel, 0, 16);
3008 dspfunc(avg_h264_qpel, 1, 8);
3009 dspfunc(avg_h264_qpel, 2, 4);
3010
3011 #undef dspfunc
3012 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3013 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3014 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3015 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3016 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3017 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3018
3019 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3020 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;