reorganize and simplify the VP3 IDCT stuff
[libav.git] / libavcodec / dsputil.c
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21 */
22
23 /**
24 * @file dsputil.c
25 * DSP utils
26 */
27
28 #include "avcodec.h"
29 #include "dsputil.h"
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
32 #include "faandct.h"
33
/* Clamp table with MAX_NEG_CROP headroom on both sides so that indexing with
 * moderately out-of-range values stays in bounds; contents are filled in at
 * init time outside this excerpt — NOTE(review): presumably dsputil_init(),
 * confirm. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Square lookup table; code below indexes it as (squareTbl+256)[x] for
 * -255 <= x <= 255, so entry 256+x is expected to hold x*x — filled in
 * outside this excerpt. */
uint32_t squareTbl[512];
36
37 const uint8_t ff_zigzag_direct[64] = {
38 0, 1, 8, 16, 9, 2, 3, 10,
39 17, 24, 32, 25, 18, 11, 4, 5,
40 12, 19, 26, 33, 40, 48, 41, 34,
41 27, 20, 13, 6, 7, 14, 21, 28,
42 35, 42, 49, 56, 57, 50, 43, 36,
43 29, 22, 15, 23, 30, 37, 44, 51,
44 58, 59, 52, 45, 38, 31, 39, 46,
45 53, 60, 61, 54, 47, 55, 62, 63
46 };
47
48 /* Specific zigzag scan for 248 idct. NOTE that unlike the
49 specification, we interleave the fields */
50 const uint8_t ff_zigzag248_direct[64] = {
51 0, 8, 1, 9, 16, 24, 2, 10,
52 17, 25, 32, 40, 48, 56, 33, 41,
53 18, 26, 3, 11, 4, 12, 19, 27,
54 34, 42, 49, 57, 50, 58, 35, 43,
55 20, 28, 5, 13, 6, 14, 21, 29,
56 36, 44, 51, 59, 52, 60, 37, 45,
57 22, 30, 7, 15, 23, 31, 38, 46,
58 53, 61, 54, 62, 39, 47, 55, 63,
59 };
60
61 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
62 uint16_t __align8 inv_zigzag_direct16[64];
63
64 const uint8_t ff_alternate_horizontal_scan[64] = {
65 0, 1, 2, 3, 8, 9, 16, 17,
66 10, 11, 4, 5, 6, 7, 15, 14,
67 13, 12, 19, 18, 24, 25, 32, 33,
68 26, 27, 20, 21, 22, 23, 28, 29,
69 30, 31, 34, 35, 40, 41, 48, 49,
70 42, 43, 36, 37, 38, 39, 44, 45,
71 46, 47, 50, 51, 56, 57, 58, 59,
72 52, 53, 54, 55, 60, 61, 62, 63,
73 };
74
75 const uint8_t ff_alternate_vertical_scan[64] = {
76 0, 8, 16, 24, 1, 9, 2, 10,
77 17, 25, 32, 40, 48, 56, 57, 49,
78 41, 33, 26, 18, 3, 11, 4, 12,
79 19, 27, 34, 42, 50, 58, 35, 43,
80 51, 59, 20, 28, 5, 13, 6, 14,
81 21, 29, 36, 44, 52, 60, 37, 45,
82 53, 61, 22, 30, 7, 15, 23, 31,
83 38, 46, 54, 62, 39, 47, 55, 63,
84 };
85
86 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
87 const uint32_t inverse[256]={
88 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
89 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
90 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
91 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
92 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
93 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
94 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
95 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
96 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
97 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
98 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
99 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
100 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
101 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
102 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
103 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
104 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
105 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
106 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
107 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
108 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
109 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
110 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
111 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
112 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
113 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
114 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
115 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
116 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
117 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
118 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
119 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
120 };
121
122 /* Input permutation for the simple_idct_mmx */
123 static const uint8_t simple_mmx_permutation[64]={
124 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
125 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
126 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
127 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
128 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
129 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
130 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
131 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
132 };
133
/**
 * Sum all 256 sample values of a 16x16 pixel block.
 *
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
155
156 static int pix_norm1_c(uint8_t * pix, int line_size)
157 {
158 int s, i, j;
159 uint32_t *sq = squareTbl + 256;
160
161 s = 0;
162 for (i = 0; i < 16; i++) {
163 for (j = 0; j < 16; j += 8) {
164 #if 0
165 s += sq[pix[0]];
166 s += sq[pix[1]];
167 s += sq[pix[2]];
168 s += sq[pix[3]];
169 s += sq[pix[4]];
170 s += sq[pix[5]];
171 s += sq[pix[6]];
172 s += sq[pix[7]];
173 #else
174 #if LONG_MAX > 2147483647
175 register uint64_t x=*(uint64_t*)pix;
176 s += sq[x&0xff];
177 s += sq[(x>>8)&0xff];
178 s += sq[(x>>16)&0xff];
179 s += sq[(x>>24)&0xff];
180 s += sq[(x>>32)&0xff];
181 s += sq[(x>>40)&0xff];
182 s += sq[(x>>48)&0xff];
183 s += sq[(x>>56)&0xff];
184 #else
185 register uint32_t x=*(uint32_t*)pix;
186 s += sq[x&0xff];
187 s += sq[(x>>8)&0xff];
188 s += sq[(x>>16)&0xff];
189 s += sq[(x>>24)&0xff];
190 x=*(uint32_t*)(pix+4);
191 s += sq[x&0xff];
192 s += sq[(x>>8)&0xff];
193 s += sq[(x>>16)&0xff];
194 s += sq[(x>>24)&0xff];
195 #endif
196 #endif
197 pix += 8;
198 }
199 pix += line_size - 16;
200 }
201 return s;
202 }
203
/**
 * Byte-swap an array of w 32-bit words from src into dst.
 * The bulk runs eight words per iteration; a scalar tail handles the rest.
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int n = 0;

    while (n + 8 <= w) {
        dst[n    ] = bswap_32(src[n    ]);
        dst[n + 1] = bswap_32(src[n + 1]);
        dst[n + 2] = bswap_32(src[n + 2]);
        dst[n + 3] = bswap_32(src[n + 3]);
        dst[n + 4] = bswap_32(src[n + 4]);
        dst[n + 5] = bswap_32(src[n + 5]);
        dst[n + 6] = bswap_32(src[n + 6]);
        dst[n + 7] = bswap_32(src[n + 7]);
        n += 8;
    }
    while (n < w) {
        dst[n] = bswap_32(src[n]);
        n++;
    }
}
221
222 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
223 {
224 int s, i;
225 uint32_t *sq = squareTbl + 256;
226
227 s = 0;
228 for (i = 0; i < h; i++) {
229 s += sq[pix1[0] - pix2[0]];
230 s += sq[pix1[1] - pix2[1]];
231 s += sq[pix1[2] - pix2[2]];
232 s += sq[pix1[3] - pix2[3]];
233 s += sq[pix1[4] - pix2[4]];
234 s += sq[pix1[5] - pix2[5]];
235 s += sq[pix1[6] - pix2[6]];
236 s += sq[pix1[7] - pix2[7]];
237 pix1 += line_size;
238 pix2 += line_size;
239 }
240 return s;
241 }
242
243 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
244 {
245 int s, i;
246 uint32_t *sq = squareTbl + 256;
247
248 s = 0;
249 for (i = 0; i < h; i++) {
250 s += sq[pix1[ 0] - pix2[ 0]];
251 s += sq[pix1[ 1] - pix2[ 1]];
252 s += sq[pix1[ 2] - pix2[ 2]];
253 s += sq[pix1[ 3] - pix2[ 3]];
254 s += sq[pix1[ 4] - pix2[ 4]];
255 s += sq[pix1[ 5] - pix2[ 5]];
256 s += sq[pix1[ 6] - pix2[ 6]];
257 s += sq[pix1[ 7] - pix2[ 7]];
258 s += sq[pix1[ 8] - pix2[ 8]];
259 s += sq[pix1[ 9] - pix2[ 9]];
260 s += sq[pix1[10] - pix2[10]];
261 s += sq[pix1[11] - pix2[11]];
262 s += sq[pix1[12] - pix2[12]];
263 s += sq[pix1[13] - pix2[13]];
264 s += sq[pix1[14] - pix2[14]];
265 s += sq[pix1[15] - pix2[15]];
266
267 pix1 += line_size;
268 pix2 += line_size;
269 }
270 return s;
271 }
272
273 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
274 {
275 int i;
276
277 /* read the pixels */
278 for(i=0;i<8;i++) {
279 block[0] = pixels[0];
280 block[1] = pixels[1];
281 block[2] = pixels[2];
282 block[3] = pixels[3];
283 block[4] = pixels[4];
284 block[5] = pixels[5];
285 block[6] = pixels[6];
286 block[7] = pixels[7];
287 pixels += line_size;
288 block += 8;
289 }
290 }
291
292 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
293 const uint8_t *s2, int stride){
294 int i;
295
296 /* read the pixels */
297 for(i=0;i<8;i++) {
298 block[0] = s1[0] - s2[0];
299 block[1] = s1[1] - s2[1];
300 block[2] = s1[2] - s2[2];
301 block[3] = s1[3] - s2[3];
302 block[4] = s1[4] - s2[4];
303 block[5] = s1[5] - s2[5];
304 block[6] = s1[6] - s2[6];
305 block[7] = s1[7] - s2[7];
306 s1 += stride;
307 s2 += stride;
308 block += 8;
309 }
310 }
311
312
313 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
314 int line_size)
315 {
316 int i;
317 uint8_t *cm = cropTbl + MAX_NEG_CROP;
318
319 /* read the pixels */
320 for(i=0;i<8;i++) {
321 pixels[0] = cm[block[0]];
322 pixels[1] = cm[block[1]];
323 pixels[2] = cm[block[2]];
324 pixels[3] = cm[block[3]];
325 pixels[4] = cm[block[4]];
326 pixels[5] = cm[block[5]];
327 pixels[6] = cm[block[6]];
328 pixels[7] = cm[block[7]];
329
330 pixels += line_size;
331 block += 8;
332 }
333 }
334
335 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
336 int line_size)
337 {
338 int i;
339 uint8_t *cm = cropTbl + MAX_NEG_CROP;
340
341 /* read the pixels */
342 for(i=0;i<8;i++) {
343 pixels[0] = cm[pixels[0] + block[0]];
344 pixels[1] = cm[pixels[1] + block[1]];
345 pixels[2] = cm[pixels[2] + block[2]];
346 pixels[3] = cm[pixels[3] + block[3]];
347 pixels[4] = cm[pixels[4] + block[4]];
348 pixels[5] = cm[pixels[5] + block[5]];
349 pixels[6] = cm[pixels[6] + block[6]];
350 pixels[7] = cm[pixels[7] + block[7]];
351 pixels += line_size;
352 block += 8;
353 }
354 }
/*
 * Pixel copy / averaging primitives.
 *
 * PIXOP2(OPNAME, OP) expands into a complete family of pixel functions for
 * one output operation OP: plain copies, x2/y2/xy2 half-pel interpolators
 * (rounding and "no_rnd" truncating variants) and the *_l2/*_l4 two- and
 * four-source averagers.  They work several pixels at a time using
 * byte-parallel arithmetic on machine words loaded through the LD16/LD32/
 * LD64 unaligned-load macros and the (no_)rnd_avg32() helpers, all defined
 * outside this excerpt.
 *
 * The #if 0 branch is a disabled 64-bit-word implementation kept for
 * reference; the active branch works on 16/32-bit words.
 */
#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

/* Active branch: 16/32-bit-word implementation; 8-pixel rows are processed
 * as two 32-bit halves. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block )), LD16(pixels ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block )), LD32(pixels ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD16(&src1[i*src_stride1  ]);\
        b= LD16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i, a0, b0, a1, b1;\
    a0= pixels[0];\
    b0= pixels[1] + 2;\
    a0 += b0;\
    b0 += pixels[2];\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        a1= pixels[0];\
        b1= pixels[1];\
        a1 += b1;\
        b1 += pixels[2];\
\
        block[0]= (a1+a0)>>2; /* FIXME non put */\
        block[1]= (b1+b0)>>2;\
\
        pixels+=line_size;\
        block +=line_size;\
\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        block[0]= (a1+a0)>>2;\
        block[1]= (b1+b0)>>2;\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint32_t a= LD32(pixels  );\
    const uint32_t b= LD32(pixels+1);\
    uint32_t l0=  (a&0x03030303UL)\
                + (b&0x03030303UL)\
                + 0x02020202UL;\
    uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
               + ((b&0xFCFCFCFCUL)>>2);\
    uint32_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint32_t a= LD32(pixels  );\
        uint32_t b= LD32(pixels+1);\
        l1=  (a&0x03030303UL)\
           + (b&0x03030303UL);\
        h1= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD32(pixels  );\
        b= LD32(pixels+1);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c    , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
/* op_put is a plain store; together with op_avg it parameterizes PIXOP2. */
#define op_put(a, b) a = b

/* Instantiate the whole family for "avg" (average with destination) and
 * "put" (plain store). */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
873
/* Scalar rounded averages of 2 and 4 samples.
 * NOTE(review): the arguments are not individually parenthesized, so these
 * are only safe with simple operands, as used in this file. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

/* Single-stride wrappers around the macro-generated two-source
 * no-rounding averagers defined above. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
884
/**
 * One-vector global motion compensation for an 8-pixel-wide block with
 * 1/16-pel accuracy: bilinear interpolation of the 2x2 neighbourhood
 * weighted by the subpel offsets (x16, y16), each in 0..15.
 *
 * @param rounder value added before the >>8 normalization (weights sum to 256)
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int y, x;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (A * src[x]          + B * src[x + 1] +
                      C * src[stride + x] + D * src[stride + x + 1] +
                      rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
907
/**
 * Global motion compensation with a full affine motion field (one motion
 * vector per pixel, advanced by (dxx,dyx) across a row and (dxy,dyy) down
 * the block).  Samples outside the picture are clamped to the nearest edge.
 *
 * NOTE(review): vx/vy look like fixed-point positions where the integer
 * pixel position is (v>>16)>>shift and the subpel fraction is
 * (v>>16)&(s-1), i.e. `shift` fractional bits sit just above bit 16 —
 * confirm against the MPEG-4 GMC caller.
 *
 * @param r     rounding constant added before the >>(2*shift) normalization
 * @param width,height  picture dimensions used for edge clamping
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;   /* from now on: last valid column index */
    height--;  /* last valid row index */

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: bilinear interpolation of the 2x2
                     * neighbourhood */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    /* clamped vertically: interpolate horizontally only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* clamped horizontally: interpolate vertically only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* clamped in both directions: nearest edge sample */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
965
/* Full-pel third-pel position: no filtering, plain block copy dispatched on
 * the block width (2/4/8/16). */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
974
/* Third-pel (1,0): horizontal interpolation with weights 2:1, normalised by
 * the fixed-point factor 683/2^11 (~1/3). */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (2 * src[x] + src[x + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
985
/* Third-pel (2,0): horizontal interpolation with weights 1:2, normalised by
 * the fixed-point factor 683/2^11 (~1/3). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (src[x] + 2 * src[x + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
996
/* Third-pel (0,1): vertical interpolation with weights 2:1, normalised by
 * the fixed-point factor 683/2^11 (~1/3). */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (2 * src[x] + src[x + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1007
/* Third-pel (1,1): 2x2 bilinear with weights 4,3,3,2 (sum 12), normalised by
 * the fixed-point factor 2731/2^15 (~1/12). */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (4 * src[x] + 3 * src[x + 1]
                            + 3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1018
/* Third-pel (1,2): 2x2 bilinear with weights 3,2,4,3 (sum 12), normalised by
 * the fixed-point factor 2731/2^15 (~1/12). */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (3 * src[x] + 2 * src[x + 1]
                            + 4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1029
/* Third-pel (0,2): vertical interpolation with weights 1:2, normalised by
 * the fixed-point factor 683/2^11 (~1/3). */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (src[x] + 2 * src[x + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
1040
/* Third-pel (2,1): 2x2 bilinear with weights 3,4,2,3 (sum 12), normalised by
 * the fixed-point factor 2731/2^15 (~1/12). */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (3 * src[x] + 4 * src[x + 1]
                            + 2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1051
/* Third-pel (2,2): 2x2 bilinear with weights 2,3,3,4 (sum 12), normalised by
 * the fixed-point factor 2731/2^15 (~1/12). */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (2 * src[x] + 3 * src[x + 1]
                            + 3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
1062
/* Full-pel third-pel position with destination averaging: plain averaging
 * copy dispatched on the block width (2/4/8/16). */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1071
/* Third-pel (1,0) with destination averaging: the interpolated value is
 * averaged (rounding up) with the existing dst pixel. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x;
    while (height--) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (2 * src[x] + src[x + 1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1082
/* Third-pel (2,0) with destination averaging: the interpolated value is
 * averaged (rounding up) with the existing dst pixel. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x;
    while (height--) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (src[x] + 2 * src[x + 1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1093
/* Third-pel (0,1) with destination averaging: the interpolated value is
 * averaged (rounding up) with the existing dst pixel. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x;
    while (height--) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (2 * src[x] + src[x + stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1104
/* Third-pel (1,1) with destination averaging: 2x2 bilinear (weights 4,3,3,2)
 * averaged (rounding up) with the existing dst pixel. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x;
    while (height--) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (4 * src[x] + 3 * src[x + 1]
                                       + 3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1115
/* Third-pel (1,2) with destination averaging: 2x2 bilinear (weights 3,2,4,3)
 * averaged (rounding up) with the existing dst pixel. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x;
    while (height--) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (3 * src[x] + 2 * src[x + 1]
                                       + 4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1126
/* Third-pel (0,2) with destination averaging: the interpolated value is
 * averaged (rounding up) with the existing dst pixel. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x;
    while (height--) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (src[x] + 2 * src[x + stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1137
/* Third-pel (2,1) with destination averaging: 2x2 bilinear (weights 3,4,2,3)
 * averaged (rounding up) with the existing dst pixel. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x;
    while (height--) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (3 * src[x] + 4 * src[x + 1]
                                       + 2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1148
/* Third-pel (2,2) with destination averaging: 2x2 bilinear (weights 2,3,3,4)
 * averaged (rounding up) with the existing dst pixel. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x;
    while (height--) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (2 * src[x] + 3 * src[x + 1]
                                       + 3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
#if 0
/* Disabled: would stamp out fixed-width wrappers for the tpel functions
 * above.  Note the stray 'void' before each call — this would not compile
 * as-is if the block were ever enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1180
/*
 * H.264 chroma motion compensation, C reference.
 * Generates 2/4/8-wide variants: each output pixel is the 2x2 bilinear
 * combination of the source neighbourhood with 1/8-pel weights
 * A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy (sum 64); the OP macro supplied by
 * the instantiation performs the final scaling/rounding (and, for the avg
 * variant, the averaging with dst).
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1243
/* Final scaling for the chroma MC above: the weighted sum carries 6 fractional
 * bits (weights sum to 64), so op_put rounds and shifts by 6; op_avg
 * additionally averages with the existing dst value, rounding up. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1251
/* Copy h rows of 4 bytes each via the unaligned 32-bit load/store macros. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h--) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1262
/* Copy h rows of 8 bytes each via the unaligned 32-bit load/store macros. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h--) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
1274
/* Copy h rows of 16 bytes each via the unaligned 32-bit load/store macros. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h--) {
        ST32(dst,      LD32(src));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}
1288
/* Copy h rows of 17 bytes each (16 via 32-bit chunks plus one trailing byte);
 * the odd width matches the 16+1 source window of the qpel filters. */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h--) {
        ST32(dst,      LD32(src));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1303
/* Copy h rows of 9 bytes each (8 via 32-bit chunks plus one trailing byte);
 * the odd width matches the 8+1 source window of the qpel filters. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h--) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1316
1317
1318 #define QPEL_MC(r, OPNAME, RND, OP) \
1319 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1320 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1321 int i;\
1322 for(i=0; i<h; i++)\
1323 {\
1324 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1325 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1326 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1327 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1328 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1329 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1330 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1331 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1332 dst+=dstStride;\
1333 src+=srcStride;\
1334 }\
1335 }\
1336 \
1337 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1338 const int w=8;\
1339 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1340 int i;\
1341 for(i=0; i<w; i++)\
1342 {\
1343 const int src0= src[0*srcStride];\
1344 const int src1= src[1*srcStride];\
1345 const int src2= src[2*srcStride];\
1346 const int src3= src[3*srcStride];\
1347 const int src4= src[4*srcStride];\
1348 const int src5= src[5*srcStride];\
1349 const int src6= src[6*srcStride];\
1350 const int src7= src[7*srcStride];\
1351 const int src8= src[8*srcStride];\
1352 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1353 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1354 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1355 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1356 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1357 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1358 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1359 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1360 dst++;\
1361 src++;\
1362 }\
1363 }\
1364 \
1365 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1366 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1367 int i;\
1368 \
1369 for(i=0; i<h; i++)\
1370 {\
1371 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1372 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1373 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1374 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1375 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1376 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1377 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1378 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1379 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1380 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1381 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1382 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1383 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1384 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1385 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1386 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1387 dst+=dstStride;\
1388 src+=srcStride;\
1389 }\
1390 }\
1391 \
1392 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1393 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1394 int i;\
1395 const int w=16;\
1396 for(i=0; i<w; i++)\
1397 {\
1398 const int src0= src[0*srcStride];\
1399 const int src1= src[1*srcStride];\
1400 const int src2= src[2*srcStride];\
1401 const int src3= src[3*srcStride];\
1402 const int src4= src[4*srcStride];\
1403 const int src5= src[5*srcStride];\
1404 const int src6= src[6*srcStride];\
1405 const int src7= src[7*srcStride];\
1406 const int src8= src[8*srcStride];\
1407 const int src9= src[9*srcStride];\
1408 const int src10= src[10*srcStride];\
1409 const int src11= src[11*srcStride];\
1410 const int src12= src[12*srcStride];\
1411 const int src13= src[13*srcStride];\
1412 const int src14= src[14*srcStride];\
1413 const int src15= src[15*srcStride];\
1414 const int src16= src[16*srcStride];\
1415 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1416 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1417 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1418 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1419 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1420 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1421 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1422 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1423 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1424 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1425 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1426 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1427 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1428 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1429 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1430 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1431 dst++;\
1432 src++;\
1433 }\
1434 }\
1435 \
1436 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1437 OPNAME ## pixels8_c(dst, src, stride, 8);\
1438 }\
1439 \
1440 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1441 uint8_t half[64];\
1442 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1443 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1444 }\
1445 \
1446 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1447 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1448 }\
1449 \
1450 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1451 uint8_t half[64];\
1452 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1453 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1454 }\
1455 \
1456 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1457 uint8_t full[16*9];\
1458 uint8_t half[64];\
1459 copy_block9(full, src, 16, stride, 9);\
1460 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1461 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1462 }\
1463 \
1464 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1465 uint8_t full[16*9];\
1466 copy_block9(full, src, 16, stride, 9);\
1467 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1468 }\
1469 \
1470 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1471 uint8_t full[16*9];\
1472 uint8_t half[64];\
1473 copy_block9(full, src, 16, stride, 9);\
1474 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1475 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1476 }\
1477 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1478 uint8_t full[16*9];\
1479 uint8_t halfH[72];\
1480 uint8_t halfV[64];\
1481 uint8_t halfHV[64];\
1482 copy_block9(full, src, 16, stride, 9);\
1483 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1484 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1485 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1486 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1487 }\
1488 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1489 uint8_t full[16*9];\
1490 uint8_t halfH[72];\
1491 uint8_t halfHV[64];\
1492 copy_block9(full, src, 16, stride, 9);\
1493 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1494 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1495 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1496 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1497 }\
1498 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1499 uint8_t full[16*9];\
1500 uint8_t halfH[72];\
1501 uint8_t halfV[64];\
1502 uint8_t halfHV[64];\
1503 copy_block9(full, src, 16, stride, 9);\
1504 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1505 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1506 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1507 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1508 }\
1509 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1510 uint8_t full[16*9];\
1511 uint8_t halfH[72];\
1512 uint8_t halfHV[64];\
1513 copy_block9(full, src, 16, stride, 9);\
1514 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1515 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1516 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1517 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1518 }\
1519 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1520 uint8_t full[16*9];\
1521 uint8_t halfH[72];\
1522 uint8_t halfV[64];\
1523 uint8_t halfHV[64];\
1524 copy_block9(full, src, 16, stride, 9);\
1525 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1526 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1527 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1528 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1529 }\
1530 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1531 uint8_t full[16*9];\
1532 uint8_t halfH[72];\
1533 uint8_t halfHV[64];\
1534 copy_block9(full, src, 16, stride, 9);\
1535 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1536 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1537 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1538 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1539 }\
1540 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1541 uint8_t full[16*9];\
1542 uint8_t halfH[72];\
1543 uint8_t halfV[64];\
1544 uint8_t halfHV[64];\
1545 copy_block9(full, src, 16, stride, 9);\
1546 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1547 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1548 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1549 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1550 }\
1551 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1552 uint8_t full[16*9];\
1553 uint8_t halfH[72];\
1554 uint8_t halfHV[64];\
1555 copy_block9(full, src, 16, stride, 9);\
1556 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1557 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1558 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1559 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1560 }\
1561 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1562 uint8_t halfH[72];\
1563 uint8_t halfHV[64];\
1564 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1565 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1566 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1567 }\
1568 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1569 uint8_t halfH[72];\
1570 uint8_t halfHV[64];\
1571 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1572 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1573 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1574 }\
1575 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1576 uint8_t full[16*9];\
1577 uint8_t halfH[72];\
1578 uint8_t halfV[64];\
1579 uint8_t halfHV[64];\
1580 copy_block9(full, src, 16, stride, 9);\
1581 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1582 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1583 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1584 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1585 }\
1586 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1587 uint8_t full[16*9];\
1588 uint8_t halfH[72];\
1589 copy_block9(full, src, 16, stride, 9);\
1590 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1591 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1592 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1593 }\
1594 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1595 uint8_t full[16*9];\
1596 uint8_t halfH[72];\
1597 uint8_t halfV[64];\
1598 uint8_t halfHV[64];\
1599 copy_block9(full, src, 16, stride, 9);\
1600 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1601 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1602 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1603 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1604 }\
1605 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1606 uint8_t full[16*9];\
1607 uint8_t halfH[72];\
1608 copy_block9(full, src, 16, stride, 9);\
1609 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1610 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1611 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1612 }\
1613 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1614 uint8_t halfH[72];\
1615 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1616 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1617 }\
1618 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1619 OPNAME ## pixels16_c(dst, src, stride, 16);\
1620 }\
1621 \
1622 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1623 uint8_t half[256];\
1624 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1625 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1626 }\
1627 \
1628 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1629 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1630 }\
1631 \
1632 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1633 uint8_t half[256];\
1634 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1635 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1636 }\
1637 \
1638 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1639 uint8_t full[24*17];\
1640 uint8_t half[256];\
1641 copy_block17(full, src, 24, stride, 17);\
1642 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1643 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1644 }\
1645 \
1646 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1647 uint8_t full[24*17];\
1648 copy_block17(full, src, 24, stride, 17);\
1649 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1650 }\
1651 \
1652 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1653 uint8_t full[24*17];\
1654 uint8_t half[256];\
1655 copy_block17(full, src, 24, stride, 17);\
1656 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1657 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1658 }\
1659 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1660 uint8_t full[24*17];\
1661 uint8_t halfH[272];\
1662 uint8_t halfV[256];\
1663 uint8_t halfHV[256];\
1664 copy_block17(full, src, 24, stride, 17);\
1665 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1666 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1667 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1668 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1669 }\
1670 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1671 uint8_t full[24*17];\
1672 uint8_t halfH[272];\
1673 uint8_t halfHV[256];\
1674 copy_block17(full, src, 24, stride, 17);\
1675 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1676 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1677 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1678 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1679 }\
1680 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1681 uint8_t full[24*17];\
1682 uint8_t halfH[272];\
1683 uint8_t halfV[256];\
1684 uint8_t halfHV[256];\
1685 copy_block17(full, src, 24, stride, 17);\
1686 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1687 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1688 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1689 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1690 }\
1691 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1692 uint8_t full[24*17];\
1693 uint8_t halfH[272];\
1694 uint8_t halfHV[256];\
1695 copy_block17(full, src, 24, stride, 17);\
1696 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1697 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1698 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1699 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1700 }\
1701 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1702 uint8_t full[24*17];\
1703 uint8_t halfH[272];\
1704 uint8_t halfV[256];\
1705 uint8_t halfHV[256];\
1706 copy_block17(full, src, 24, stride, 17);\
1707 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1708 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1709 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1710 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1711 }\
1712 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1713 uint8_t full[24*17];\
1714 uint8_t halfH[272];\
1715 uint8_t halfHV[256];\
1716 copy_block17(full, src, 24, stride, 17);\
1717 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1718 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1719 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1720 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1721 }\
1722 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1723 uint8_t full[24*17];\
1724 uint8_t halfH[272];\
1725 uint8_t halfV[256];\
1726 uint8_t halfHV[256];\
1727 copy_block17(full, src, 24, stride, 17);\
1728 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1729 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1730 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1731 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1732 }\
1733 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1734 uint8_t full[24*17];\
1735 uint8_t halfH[272];\
1736 uint8_t halfHV[256];\
1737 copy_block17(full, src, 24, stride, 17);\
1738 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1739 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1740 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1741 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1742 }\
1743 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1744 uint8_t halfH[272];\
1745 uint8_t halfHV[256];\
1746 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1747 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1748 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1749 }\
1750 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1751 uint8_t halfH[272];\
1752 uint8_t halfHV[256];\
1753 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1754 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1755 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1756 }\
1757 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1758 uint8_t full[24*17];\
1759 uint8_t halfH[272];\
1760 uint8_t halfV[256];\
1761 uint8_t halfHV[256];\
1762 copy_block17(full, src, 24, stride, 17);\
1763 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1764 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1765 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1766 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1767 }\
1768 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1769 uint8_t full[24*17];\
1770 uint8_t halfH[272];\
1771 copy_block17(full, src, 24, stride, 17);\
1772 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1773 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1774 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1775 }\
1776 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1777 uint8_t full[24*17];\
1778 uint8_t halfH[272];\
1779 uint8_t halfV[256];\
1780 uint8_t halfHV[256];\
1781 copy_block17(full, src, 24, stride, 17);\
1782 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1783 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1784 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1785 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1786 }\
1787 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1788 uint8_t full[24*17];\
1789 uint8_t halfH[272];\
1790 copy_block17(full, src, 24, stride, 17);\
1791 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1792 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1793 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1794 }\
1795 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1796 uint8_t halfH[272];\
1797 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1798 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1799 }
1800
/* Store operators used to instantiate the MPEG-4 qpel MC functions above.
 * The filter sums are scaled by 32; "+16)>>5" rounds to nearest while
 * "+15)>>5" is the no-rounding variant, and cm is the 0..255 clipping
 * table in scope at each expansion site.  op_avg additionally averages
 * with the existing destination pixel (rounding up via "+1"). */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1814
#if 1
/*
 * H.264 6-tap half-sample interpolation filter, coefficients (1,-5,20,20,-5,1).
 *
 * H264_LOWPASS instantiates horizontal (_h_), vertical (_v_) and combined
 * (_hv_) lowpass helpers for 4x4, 8x8 and 16x16 blocks for a given store
 * operator pair:
 *   OP  - stores a value whose filter sum is scaled by 32 (one filter pass);
 *   OP2 - stores a value scaled by 1024 (two passes; used by the _hv_
 *         variants, which keep the intermediate horizontal sums unclipped
 *         in an int16_t tmp buffer).
 * The 16x16 helpers are composed of four 8x8 calls.
 * NOTE: comments inside the macro body use only the slash-star form, since
 * backslash-newline splicing happens before comment removal.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride; /* 6-tap vertical pass needs 2 extra rows above */\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride; /* 6-tap vertical pass needs 2 extra rows above */\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/*
 * H264_MC instantiates the 16 quarter-sample motion-compensation entry
 * points mcXY (X = horizontal, Y = vertical quarter-pel phase, 0..3) for a
 * given block SIZE, built on the OPNAME##...lowpass helpers generated by
 * H264_LOWPASS.  Quarter-pel positions are obtained by averaging
 * (pixels##SIZE##_l2) the two nearest integer/half-pel planes; the pure
 * half-pel positions (mc20, mc02, mc22) call the lowpass filter directly.
 * "full" buffers hold a (SIZE+5)-row copy of the source with the 2-row
 * top margin the 6-tap vertical filter reads (full_mid skips that margin).
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

/* Store operators for the H.264 lowpass instantiations: op_* clip and
 * store the 1-D filter sums (scaled by 32, hence "+16)>>5"), op2_* the
 * 2-D hv sums (scaled by 1024, hence "+512)>>10").  The avg variants
 * round-average with the existing destination pixel. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]

H264_LOWPASS(put_ , op_put, op2_put)
H264_LOWPASS(avg_ , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2176
2177 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2178 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2179 int i;
2180
2181 for(i=0; i<h; i++){
2182 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2183 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2184 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2185 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2186 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2187 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2188 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2189 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2190 dst+=dstStride;
2191 src+=srcStride;
2192 }
2193 }
2194
2195 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2196 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2197 int i;
2198
2199 for(i=0; i<w; i++){
2200 const int src_1= src[ -srcStride];
2201 const int src0 = src[0 ];
2202 const int src1 = src[ srcStride];
2203 const int src2 = src[2*srcStride];
2204 const int src3 = src[3*srcStride];
2205 const int src4 = src[4*srcStride];
2206 const int src5 = src[5*srcStride];
2207 const int src6 = src[6*srcStride];
2208 const int src7 = src[7*srcStride];
2209 const int src8 = src[8*srcStride];
2210 const int src9 = src[9*srcStride];
2211 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2212 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2213 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2214 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2215 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2216 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2217 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2218 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2219 src++;
2220 dst++;
2221 }
2222 }
2223
/* MSPEL MC at the integer position (0,0): plain 8x8 block copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2227
2228 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2229 uint8_t half[64];
2230 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2231 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2232 }
2233
/* MSPEL MC at horizontal phase 1/2: horizontal lowpass straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2237
2238 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2239 uint8_t half[64];
2240 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2241 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2242 }
2243
/* MSPEL MC at vertical phase 1/2: vertical lowpass straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2247
2248 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2249 uint8_t halfH[88];
2250 uint8_t halfV[64];
2251 uint8_t halfHV[64];
2252 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2253 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2254 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2255 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2256 }
2257 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2258 uint8_t halfH[88];
2259 uint8_t halfV[64];
2260 uint8_t halfHV[64];
2261 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2262 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2263 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2264 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2265 }
2266 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2267 uint8_t halfH[88];
2268 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2269 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2270 }
2271
/* H.263 (Annex J style) in-place deblocking filter across a horizontal
 * block edge: filters 8 columns of the 4 pixels p0..p3 straddling the edge
 * (p0,p1 above; p2,p3 below).  Filter strength is looked up from qscale. */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear ramp: d1 tracks d while |d| < strength, ramps
           back to 0 by |d| = 2*strength, and is 0 beyond (so strong edges,
           presumed to be real image features, are left untouched) */
        if (d<-2*strength) d1= 0;
        else if(d<- strength) d1=-2*strength - d;
        else if(d< strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branchless clamp to 0..255: bit 8 set means the value left
           [0,255]; ~(p>>31) is then 255 on overflow and 0 on underflow
           (assumes the adjusted value stays within [-256,511] — holds for
           8-bit pixels with the bounded d1 above) */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        /* secondary, weaker correction of the outer pixels, limited to
           half the magnitude of the inner correction */
        ad1= ABS(d1)>>1;

        d2= clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+ stride] = p3 + d2;
    }
}
2306
/* H.263 in-place deblocking filter across a vertical block edge: same
 * algorithm as h263_v_loop_filter_c, applied to 8 rows of the 4 pixels
 * p0..p3 straddling the edge (p0,p1 left; p2,p3 right). */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear ramp: full correction for small |d|, fading to
           none by |d| = 2*strength (see h263_v_loop_filter_c) */
        if (d<-2*strength) d1= 0;
        else if(d<- strength) d1=-2*strength - d;
        else if(d< strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branchless clamp to 0..255 via bit 8 overflow detection */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        /* secondary correction of the outer pixels, capped at |d1|/2 */
        ad1= ABS(d1)>>1;

        d2= clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
}
2341
/* Sum of absolute differences over a 16-wide block of h rows.
 * The first argument is an unused context pointer kept for the common
 * pix_abs function-pointer signature. */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, j;

    while (h--) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - pix2[j]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2369
/* SAD of a 16-wide block of h rows against the horizontally half-pel
 * interpolated reference (rounded average of each pixel and its right
 * neighbour; reads one pixel past column 15). */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, j;

    while (h--) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - avg2(pix2[j], pix2[j+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2397
/* SAD of a 16-wide block of h rows against the vertically half-pel
 * interpolated reference (rounded average of each pixel and the one
 * directly below it; reads one row past the block). */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0, j;

    while (h--) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - avg2(pix2[j], pix3[j]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2427
/* SAD of a 16-wide block of h rows against the diagonally half-pel
 * interpolated reference (rounded average of the 2x2 neighbourhood;
 * reads one pixel right of and one row below the block). */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *pix3 = pix2 + line_size;
    int sum = 0, j;

    while (h--) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - avg4(pix2[j], pix2[j+1], pix3[j], pix3[j+1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
2457
/* Sum of absolute differences over an 8-wide block of h rows.
 * The first argument is an unused context pointer kept for the common
 * pix_abs function-pointer signature. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, j;

    while (h--) {
        for (j = 0; j < 8; j++)
            sum += abs(pix1[j] - pix2[j]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2477
2478 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2479 {
2480 int s, i;
2481
2482 s = 0;
2483 for(i=0;i<h;i++) {
2484 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2485 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2486 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2487 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2488 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2489 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2490 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2491 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2492 pix1 += line_size;
2493 pix2 += line_size;
2494 }
2495 return s;
2496 }
2497
2498 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2499 {
2500 int s, i;
2501 uint8_t *pix3 = pix2 + line_size;
2502
2503 s = 0;
2504 for(i=0;i<h;i++) {
2505 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2506 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2507 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2508 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2509 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2510 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2511 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2512 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2513 pix1 += line_size;
2514 pix2 += line_size;
2515 pix3 += line_size;
2516 }
2517 return s;
2518 }
2519
2520 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2521 {
2522 int s, i;
2523 uint8_t *pix3 = pix2 + line_size;
2524
2525 s = 0;
2526 for(i=0;i<h;i++) {
2527 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2528 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2529 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2530 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2531 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2532 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2533 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2534 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2535 pix1 += line_size;
2536 pix2 += line_size;
2537 pix3 += line_size;
2538 }
2539 return s;
2540 }
2541
2542 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2543 int i;
2544 unsigned int sum=0;
2545
2546 for(i=0; i<8*8; i++){
2547 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2548 int w= weight[i];
2549 b>>= RECON_SHIFT;
2550 assert(-512<b && b<512);
2551
2552 sum += (w*b)*(w*b)>>4;
2553 }
2554 return sum>>2;
2555 }
2556
2557 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2558 int i;
2559
2560 for(i=0; i<8*8; i++){
2561 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2562 }
2563 }
2564
2565 /**
2566 * permutes an 8x8 block.
2567 * @param block the block which will be permuted according to the given permutation vector
2568 * @param permutation the permutation vector
2569 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2570 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2571 * (inverse) permutated to scantable order!
2572 */
2573 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2574 {
2575 int i;
2576 DCTELEM temp[64];
2577
2578 if(last<=0) return;
2579 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2580
2581 for(i=0; i<=last; i++){
2582 const int j= scantable[i];
2583 temp[j]= block[j];
2584 block[j]=0;
2585 }
2586
2587 for(i=0; i<=last; i++){
2588 const int j= scantable[i];
2589 const int perm_j= permutation[j];
2590 block[perm_j]= temp[j];
2591 }
2592 }
2593
2594 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
2595 return 0;
2596 }
2597
2598 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
2599 int i;
2600
2601 memset(cmp, 0, sizeof(void*)*5);
2602
2603 for(i=0; i<5; i++){
2604 switch(type&0xFF){
2605 case FF_CMP_SAD:
2606 cmp[i]= c->sad[i];
2607 break;
2608 case FF_CMP_SATD:
2609 cmp[i]= c->hadamard8_diff[i];
2610 break;
2611 case FF_CMP_SSE:
2612 cmp[i]= c->sse[i];
2613 break;
2614 case FF_CMP_DCT:
2615 cmp[i]= c->dct_sad[i];
2616 break;
2617 case FF_CMP_PSNR:
2618 cmp[i]= c->quant_psnr[i];
2619 break;
2620 case FF_CMP_BIT:
2621 cmp[i]= c->bit[i];
2622 break;
2623 case FF_CMP_RD:
2624 cmp[i]= c->rd[i];
2625 break;
2626 case FF_CMP_VSAD:
2627 cmp[i]= c->vsad[i];
2628 break;
2629 case FF_CMP_VSSE:
2630 cmp[i]= c->vsse[i];
2631 break;
2632 case FF_CMP_ZERO:
2633 cmp[i]= zero_cmp;
2634 break;
2635 default:
2636 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
2637 }
2638 }
2639 }
2640
2641 /**
2642 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2643 */
2644 static void clear_blocks_c(DCTELEM *blocks)
2645 {
2646 memset(blocks, 0, sizeof(DCTELEM)*6*64);
2647 }
2648
2649 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
2650 int i;
2651 for(i=0; i+7<w; i+=8){
2652 dst[i+0] += src[i+0];
2653 dst[i+1] += src[i+1];
2654 dst[i+2] += src[i+2];
2655 dst[i+3] += src[i+3];
2656 dst[i+4] += src[i+4];
2657 dst[i+5] += src[i+5];
2658 dst[i+6] += src[i+6];
2659 dst[i+7] += src[i+7];
2660 }
2661 for(; i<w; i++)
2662 dst[i+0] += src[i+0];
2663 }
2664
2665 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
2666 int i;
2667 for(i=0; i+7<w; i+=8){
2668 dst[i+0] = src1[i+0]-src2[i+0];
2669 dst[i+1] = src1[i+1]-src2[i+1];
2670 dst[i+2] = src1[i+2]-src2[i+2];
2671 dst[i+3] = src1[i+3]-src2[i+3];
2672 dst[i+4] = src1[i+4]-src2[i+4];
2673 dst[i+5] = src1[i+5]-src2[i+5];
2674 dst[i+6] = src1[i+6]-src2[i+6];
2675 dst[i+7] = src1[i+7]-src2[i+7];
2676 }
2677 for(; i<w; i++)
2678 dst[i+0] = src1[i+0]-src2[i+0];
2679 }
2680
2681 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
2682 int i;
2683 uint8_t l, lt;
2684
2685 l= *left;
2686 lt= *left_top;
2687
2688 for(i=0; i<w; i++){
2689 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
2690 lt= src1[i];
2691 l= src2[i];
2692 dst[i]= l - pred;
2693 }
2694
2695 *left= l;
2696 *left_top= lt;
2697 }
2698
/* Two-output butterfly: o1 = i1 + i2, o2 = i1 - i2.
 * NOTE: each input expression is evaluated twice. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* In-place butterfly on two lvalues: (x, y) <- (x+y, x-y). */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Final butterfly stage folded with absolute-value accumulation:
 * |x+y| + |x-y|, used when the transformed values are summed directly. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2713
/**
 * SATD of an 8x8 block: sum of absolute values of the 2-D Hadamard
 * transform of the difference src - dst.  h must be 8.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 3-stage butterfly network per row of differences */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: same network per column, last stage folded into
     * the absolute-value accumulation via BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
2765
/**
 * Intra SATD of an 8x8 block: like hadamard8_diff8x8_c but transforms the
 * source samples directly (no reference), with the DC term subtracted so
 * the score ignores the block mean.  h must be 8.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 3-stage butterfly network per source row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass with the last butterfly stage folded into the
     * absolute-value accumulation */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2813
/**
 * DCT-domain SAD of an 8x8 block: sum of absolute values of the forward
 * DCT coefficients of the difference src1 - src2.  h must be 8.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* 8-byte aligned scratch block for the DCT */
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum+= ABS(temp[i]);

    return sum;
}
2830
2831 void simple_idct(DCTELEM *block); //FIXME
2832
/**
 * PSNR-oriented 8x8 comparison: squared error, in the spatial domain,
 * introduced by quantizing and dequantizing the difference block at the
 * current qscale.  h must be 8.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* aligned scratch: first 64 coefficients are quantized, next 64 keep the original */
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0; /* force inter quantization path */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME

    /* NOTE: bak holds pre-quantization DCT coefficients while temp has been
     * through IDCT, so this mixes domains -- kept as-is (see FIXMEs). */
    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
2856
/**
 * Rate-distortion score of an 8x8 block: quantize the difference block,
 * estimate its VLC bit cost, reconstruct it (dequantize + IDCT) and return
 * SSE-after-reconstruction plus a lambda-weighted bit cost.  h must be 8.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    /* NOTE(review): variable-length array sized by stride -- assumes stride
     * stays small enough for the stack */
    uint64_t __align8 aligned_bak[stride];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    /* "distoration" is a historical typo for "distortion", kept for fidelity */
    int i, last, run, bits, level, distoration, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* save the 8x8 reference area so the reconstruction can be added onto it */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* select the VLC tables for the current macroblock type; intra blocks
     * additionally pay for the DC coefficient */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* accumulate the (run, level) VLC cost of all AC coefficients; levels
     * outside [-64, 63] fall back to the escape code */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        /* the final coefficient uses the "last" VLC table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* reconstruct: dequantize and add the IDCT onto the saved reference */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    /* distortion + lambda*rate; 109/128 approximates the ln(2)-based scale */
    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2935
/**
 * Bit-cost estimate of an 8x8 block: quantize the difference block and
 * return the estimated number of VLC bits needed to code it (no
 * distortion term).  h must be 8.
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* select VLC tables; intra blocks additionally pay for the DC coefficient */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* sum the (run, level) VLC cost; out-of-range levels use the escape code */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        /* the final coefficient uses the "last" VLC table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2995
2996 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
2997 int score=0;
2998 int x,y;
2999
3000 for(y=1; y<h; y++){
3001 for(x=0; x<16; x+=4){
3002 score+= ABS(s[x ] - s[x +stride]) + ABS(s[x+1] - s[x+1+stride])
3003 +ABS(s[x+2] - s[x+2+stride]) + ABS(s[x+3] - s[x+3+stride]);
3004 }
3005 s+= stride;
3006 }
3007
3008 return score;
3009 }
3010
3011 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3012 int score=0;
3013 int x,y;
3014
3015 for(y=1; y<h; y++){
3016 for(x=0; x<16; x++){
3017 score+= ABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3018 }
3019 s1+= stride;
3020 s2+= stride;
3021 }
3022
3023 return score;
3024 }
3025
3026 #define SQ(a) ((a)*(a))
3027 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3028 int score=0;
3029 int x,y;
3030
3031 for(y=1; y<h; y++){
3032 for(x=0; x<16; x+=4){
3033 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3034 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
3035 }
3036 s+= stride;
3037 }
3038
3039 return score;
3040 }
3041
3042 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3043 int score=0;
3044 int x,y;
3045
3046 for(y=1; y<h; y++){
3047 for(x=0; x<16; x++){
3048 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3049 }
3050 s1+= stride;
3051 s2+= stride;
3052 }
3053
3054 return score;
3055 }
3056
/* Generate 16x16 comparison functions from the 8x8 ones by summing the
 * scores of the four 8x8 quadrants ("WARPER" is a historical misspelling
 * of "wrapper" kept for compatibility). */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
3063
/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */
/* JPEG reference IDCT in place, then clamped store into dest. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* JPEG reference IDCT in place, then clamped add onto dest. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
3076
3077 /* init static data */
3078 void dsputil_static_init(void)
3079 {
3080 int i;
3081
3082 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3083 for(i=0;i<MAX_NEG_CROP;i++) {
3084 cropTbl[i] = 0;
3085 cropTbl[i + MAX_NEG_CROP + 256] = 255;
3086 }
3087
3088 for(i=0;i<512;i++) {
3089 squareTbl[i] = (i - 256) * (i - 256);
3090 }
3091
3092 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3093 }
3094
3095
/* Fill the DSPContext with the portable C implementations, then let the
 * architecture-specific init functions override whatever they accelerate,
 * and finally build the IDCT coefficient permutation table. */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

#ifdef CONFIG_ENCODERS
    /* forward DCT selection: fast integer, floating point AAN, or
     * slow/accurate integer (default) */
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
	c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
	c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
	c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT selection; the permutation type records what coefficient
     * order the chosen IDCT expects */
    if(avctx->idct_algo==FF_IDCT_INT){
        c->idct_put= ff_jref_idct_put;
        c->idct_add= ff_jref_idct_add;
        c->idct    = j_rev_dct;
        c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
    }else{ //accurate/default
        c->idct_put= simple_idct_put;
        c->idct_add= simple_idct_add;
        c->idct    = simple_idct;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }

    /* VP3 DSP support */
    c->vp3_dsp_init = vp3_dsp_init_c;
    c->vp3_idct = vp3_idct_c;

    /* pixel block access and basic statistics */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->gmc1 = gmc1_c;
    c->gmc = gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* TODO [0] 16 [1] 8 */
    /* SAD with full/half-pel interpolation variants */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

/* half-pel put/avg function tables: index 0..3 selects full/x2/y2/xy2 */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    /* third-pel motion compensation (SVQ3-style) */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

/* quarter-pel function tables: index encodes the (x,y) subpel position */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    /* H.264 chroma motion compensation */
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;

    /* WMV2/MSMPEG4 "mspel" motion compensation */
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

/* comparison functions: [0] is the 16x16 variant, [1] the 8x8 one */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;

    /* lossless codec helpers (HuffYUV etc.) */
    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;

    c->h263_h_loop_filter= h263_h_loop_filter_c;
    c->h263_v_loop_filter= h263_v_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

    /* architecture-specific overrides go last so they win over the C versions */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_SPARC
    dsputil_init_vis(c,avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif
#ifdef ARCH_SH4
    dsputil_init_sh4(c,avctx);
#endif

    /* build the coefficient permutation table matching the selected IDCT */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
3325