sh4 optimized idct & bswap patch by (BERO <bero at geocities dot co dot jp>)
[libav.git] / libavcodec / dsputil.c
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 *
19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
20 */
21
22 /**
23 * @file dsputil.c
24 * DSP utils
25 */
26
27 #include "avcodec.h"
28 #include "dsputil.h"
29 #include "mpegvideo.h"
30 #include "simple_idct.h"
31
32
/* 0..255 clamp lookup table with MAX_NEG_CROP guard entries on each side so
   out-of-range indices (negative or >255) land on saturated values; filled by
   the dsputil init code (not in this chunk). */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* square lookup, indexed as (squareTbl + 256)[v] for v in -256..255; filled at
   init (presumably (squareTbl+256)[v] == v*v — confirm in dsputil_init). */
uint32_t squareTbl[512];
35
/* Standard (MPEG/JPEG) zigzag scan order: maps scan position -> raster index
   within an 8x8 block. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
46
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* 8-byte aligned for MMX loads; presumably filled by the dsputil init code
   (initialization is not in this chunk) — confirm. */
uint16_t __align8 inv_zigzag_direct16[64];
49
/* Alternate horizontal scan order (scan position -> raster index in an 8x8
   block); counterpart of ff_alternate_vertical_scan below. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
60
/* Alternate vertical scan order (scan position -> raster index in an 8x8
   block); counterpart of ff_alternate_horizontal_scan above. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
71
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Reciprocal table for replacing small integer divisions with a multiply and
   a 32-bit shift; inverse[b] ~= 2^32/b (rounded so the identity above holds). */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
107
/* Input permutation for the simple_idct_mmx */
/* Each entry is a raster index within the 8x8 block (0x00..0x3F), giving the
   coefficient order the MMX simple IDCT expects. */
static const uint8_t simple_mmx_permutation[64]={
	0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
	0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
	0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
	0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
	0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
	0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
	0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
	0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
119
static int pix_sum_c(uint8_t * pix, int line_size)
{
    /* Sum of all 256 pixel values of a 16x16 block; rows are line_size
       bytes apart. */
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
141
142 static int pix_norm1_c(uint8_t * pix, int line_size)
143 {
144 int s, i, j;
145 uint32_t *sq = squareTbl + 256;
146
147 s = 0;
148 for (i = 0; i < 16; i++) {
149 for (j = 0; j < 16; j += 8) {
150 #if 0
151 s += sq[pix[0]];
152 s += sq[pix[1]];
153 s += sq[pix[2]];
154 s += sq[pix[3]];
155 s += sq[pix[4]];
156 s += sq[pix[5]];
157 s += sq[pix[6]];
158 s += sq[pix[7]];
159 #else
160 #if LONG_MAX > 2147483647
161 register uint64_t x=*(uint64_t*)pix;
162 s += sq[x&0xff];
163 s += sq[(x>>8)&0xff];
164 s += sq[(x>>16)&0xff];
165 s += sq[(x>>24)&0xff];
166 s += sq[(x>>32)&0xff];
167 s += sq[(x>>40)&0xff];
168 s += sq[(x>>48)&0xff];
169 s += sq[(x>>56)&0xff];
170 #else
171 register uint32_t x=*(uint32_t*)pix;
172 s += sq[x&0xff];
173 s += sq[(x>>8)&0xff];
174 s += sq[(x>>16)&0xff];
175 s += sq[(x>>24)&0xff];
176 x=*(uint32_t*)(pix+4);
177 s += sq[x&0xff];
178 s += sq[(x>>8)&0xff];
179 s += sq[(x>>16)&0xff];
180 s += sq[(x>>24)&0xff];
181 #endif
182 #endif
183 pix += 8;
184 }
185 pix += line_size - 16;
186 }
187 return s;
188 }
189
190
191 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size)
192 {
193 int s, i;
194 uint32_t *sq = squareTbl + 256;
195
196 s = 0;
197 for (i = 0; i < 8; i++) {
198 s += sq[pix1[0] - pix2[0]];
199 s += sq[pix1[1] - pix2[1]];
200 s += sq[pix1[2] - pix2[2]];
201 s += sq[pix1[3] - pix2[3]];
202 s += sq[pix1[4] - pix2[4]];
203 s += sq[pix1[5] - pix2[5]];
204 s += sq[pix1[6] - pix2[6]];
205 s += sq[pix1[7] - pix2[7]];
206 pix1 += line_size;
207 pix2 += line_size;
208 }
209 return s;
210 }
211
212 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
213 {
214 int s, i;
215 uint32_t *sq = squareTbl + 256;
216
217 s = 0;
218 for (i = 0; i < 16; i++) {
219 s += sq[pix1[ 0] - pix2[ 0]];
220 s += sq[pix1[ 1] - pix2[ 1]];
221 s += sq[pix1[ 2] - pix2[ 2]];
222 s += sq[pix1[ 3] - pix2[ 3]];
223 s += sq[pix1[ 4] - pix2[ 4]];
224 s += sq[pix1[ 5] - pix2[ 5]];
225 s += sq[pix1[ 6] - pix2[ 6]];
226 s += sq[pix1[ 7] - pix2[ 7]];
227 s += sq[pix1[ 8] - pix2[ 8]];
228 s += sq[pix1[ 9] - pix2[ 9]];
229 s += sq[pix1[10] - pix2[10]];
230 s += sq[pix1[11] - pix2[11]];
231 s += sq[pix1[12] - pix2[12]];
232 s += sq[pix1[13] - pix2[13]];
233 s += sq[pix1[14] - pix2[14]];
234 s += sq[pix1[15] - pix2[15]];
235
236 pix1 += line_size;
237 pix2 += line_size;
238 }
239 return s;
240 }
241
242 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
243 {
244 int i;
245
246 /* read the pixels */
247 for(i=0;i<8;i++) {
248 block[0] = pixels[0];
249 block[1] = pixels[1];
250 block[2] = pixels[2];
251 block[3] = pixels[3];
252 block[4] = pixels[4];
253 block[5] = pixels[5];
254 block[6] = pixels[6];
255 block[7] = pixels[7];
256 pixels += line_size;
257 block += 8;
258 }
259 }
260
261 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
262 const uint8_t *s2, int stride){
263 int i;
264
265 /* read the pixels */
266 for(i=0;i<8;i++) {
267 block[0] = s1[0] - s2[0];
268 block[1] = s1[1] - s2[1];
269 block[2] = s1[2] - s2[2];
270 block[3] = s1[3] - s2[3];
271 block[4] = s1[4] - s2[4];
272 block[5] = s1[5] - s2[5];
273 block[6] = s1[6] - s2[6];
274 block[7] = s1[7] - s2[7];
275 s1 += stride;
276 s2 += stride;
277 block += 8;
278 }
279 }
280
281
282 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
283 int line_size)
284 {
285 int i;
286 uint8_t *cm = cropTbl + MAX_NEG_CROP;
287
288 /* read the pixels */
289 for(i=0;i<8;i++) {
290 pixels[0] = cm[block[0]];
291 pixels[1] = cm[block[1]];
292 pixels[2] = cm[block[2]];
293 pixels[3] = cm[block[3]];
294 pixels[4] = cm[block[4]];
295 pixels[5] = cm[block[5]];
296 pixels[6] = cm[block[6]];
297 pixels[7] = cm[block[7]];
298
299 pixels += line_size;
300 block += 8;
301 }
302 }
303
304 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
305 int line_size)
306 {
307 int i;
308 uint8_t *cm = cropTbl + MAX_NEG_CROP;
309
310 /* read the pixels */
311 for(i=0;i<8;i++) {
312 pixels[0] = cm[pixels[0] + block[0]];
313 pixels[1] = cm[pixels[1] + block[1]];
314 pixels[2] = cm[pixels[2] + block[2]];
315 pixels[3] = cm[pixels[3] + block[3]];
316 pixels[4] = cm[pixels[4] + block[4]];
317 pixels[5] = cm[pixels[5] + block[5]];
318 pixels[6] = cm[pixels[6] + block[6]];
319 pixels[7] = cm[pixels[7] + block[7]];
320 pixels += line_size;
321 block += 8;
322 }
323 }
#if 0
/* NOTE(review): this branch is disabled.  It is a 64-bit-word variant of the
   PIXOP2 pixel-op generator below: it processes 8 pixels per LD64 load and
   uses the SWAR identities
       floor-avg(a,b) = (a&b) + (((a^b)&0xFE..FE)>>1)
       ceil-avg(a,b)  = (a|b) - (((a^b)&0xFE..FE)>>1)
   per byte (the 0xFE mask prevents carries crossing byte lanes). */

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

/* Live 32-bit-word variant: generates the full family of put/avg pixel ops
   (copy, half-pel x/y/xy interpolation, rounded and no-rnd forms) for 4/8/16
   pixel widths.  OP is either assignment (put) or a byte-wise average (avg);
   the SWAR masks (0xFEFEFEFE, 0x03030303, 0xFCFCFCFC, 0x0F0F0F0F) keep the
   per-byte arithmetic from carrying across byte lanes.  The _l2/_l4 helpers
   average 2 or 4 source rows/columns (xy2 uses the 2-bit-per-byte split
   l0/h0/l1/h1 to average 4 pixels with correct rounding: +0x02.. rounds,
   +0x01.. is the no-rnd form). */

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block )), LD32(pixels ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block )), LD32(pixels ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1 ]);\
        b= LD32(&src2[i*src_stride2 ]);\
        OP(*((uint32_t*)&dst[i*dst_stride ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1 ]);\
        b= LD32(&src2[i*src_stride2 ]);\
        OP(*((uint32_t*)&dst[i*dst_stride ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1 ]);\
        b= LD32(&src2[i*src_stride2 ]);\
        OP(*((uint32_t*)&dst[i*dst_stride ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int j;\
        for(j=0; j<2; j++){\
            int i;\
            const uint32_t a= LD32(pixels );\
            const uint32_t b= LD32(pixels+1);\
            uint32_t l0=  (a&0x03030303UL)\
                        + (b&0x03030303UL)\
                        + 0x02020202UL;\
            uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                       + ((b&0xFCFCFCFCUL)>>2);\
            uint32_t l1,h1;\
\
            pixels+=line_size;\
            for(i=0; i<h; i+=2){\
                uint32_t a= LD32(pixels );\
                uint32_t b= LD32(pixels+1);\
                l1=  (a&0x03030303UL)\
                   + (b&0x03030303UL);\
                h1= ((a&0xFCFCFCFCUL)>>2)\
                  + ((b&0xFCFCFCFCUL)>>2);\
                OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
                pixels+=line_size;\
                block +=line_size;\
                a= LD32(pixels );\
                b= LD32(pixels+1);\
                l0=  (a&0x03030303UL)\
                   + (b&0x03030303UL)\
                   + 0x02020202UL;\
                h0= ((a&0xFCFCFCFCUL)>>2)\
                  + ((b&0xFCFCFCFCUL)>>2);\
                OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
                pixels+=line_size;\
                block +=line_size;\
            }\
            pixels+=4-line_size*(h+1);\
            block +=4-line_size*h;\
        }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int j;\
        for(j=0; j<2; j++){\
            int i;\
            const uint32_t a= LD32(pixels );\
            const uint32_t b= LD32(pixels+1);\
            uint32_t l0=  (a&0x03030303UL)\
                        + (b&0x03030303UL)\
                        + 0x01010101UL;\
            uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                       + ((b&0xFCFCFCFCUL)>>2);\
            uint32_t l1,h1;\
\
            pixels+=line_size;\
            for(i=0; i<h; i+=2){\
                uint32_t a= LD32(pixels );\
                uint32_t b= LD32(pixels+1);\
                l1=  (a&0x03030303UL)\
                   + (b&0x03030303UL);\
                h1= ((a&0xFCFCFCFCUL)>>2)\
                  + ((b&0xFCFCFCFCUL)>>2);\
                OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
                pixels+=line_size;\
                block +=line_size;\
                a= LD32(pixels );\
                b= LD32(pixels+1);\
                l0=  (a&0x03030303UL)\
                   + (b&0x03030303UL)\
                   + 0x01010101UL;\
                h0= ((a&0xFCFCFCFCUL)>>2)\
                  + ((b&0xFCFCFCFCUL)>>2);\
                OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
                pixels+=line_size;\
                block +=line_size;\
            }\
            pixels+=4-line_size*(h+1);\
            block +=4-line_size*h;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
#endif
#define op_put(a, b) a = b

/* Instantiate the whole pixel-op family twice: averaging (motion comp
   blending) and plain store variants. */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
737
/* Rounded averages of 2 and 4 values.  Arguments are now fully parenthesized
   (CERT PRE01-C): the previous ((a+b+1)>>1) form mis-evaluated when an
   argument contained an operator of lower precedence than '+'. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
740
741
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* Bilinear interpolation of an 8-pixel-wide strip at a fixed sub-pel
       offset (x16, y16 in 1/16 pel units).  The four corner weights sum to
       256, so >>8 normalizes; `rounder` controls rounding behavior. */
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            dst[col] = (A * src[col]          + B * src[col + 1] +
                        C * src[stride + col] + D * src[stride + col + 1] +
                        rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
764
/**
 * Global motion compensation with an affine transform: for each destination
 * pixel the source position advances by (dxx,dyx) per column and (dxy,dyy)
 * per row, in 16.16 fixed point where one pixel is s = 1<<shift sub-pel
 * units.  Samples outside [0,width)x[0,height) are clamped to the nearest
 * edge sample; `r` is the rounding constant added before the final shift.
 * Processes an 8-pixel-wide, h-row strip of dst.
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;  /* sub-pel positions per pixel */

    /* convert to last valid integer sample coordinates for the clip tests */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* vx>>16 gives the position in 1/s pel; split it into integer
               pixel (>>shift) and sub-pel fraction (& (s-1)) */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2x2 bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* y out of range: clamp row, interpolate only in x */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* x out of range: clamp column, interpolate only in y */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* both out of range: nearest edge sample, no filtering */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
/* H.264 chroma motion compensation: 2/4/8-pixel-wide bilinear interpolation
   at 1/8-pel offsets (x, y in 0..7).  The four corner weights sum to 64; OP
   performs the final >>6 normalization together with put or avg semantics. */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
884
/* +32 then >>6 rounds the 6-bit-weighted sum; avg additionally blends with
   the existing destination pixel (rounded halving). */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_ , op_put)
H264_CHROMA_MC(avg_ , op_avg)
#undef op_avg
#undef op_put
892
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 4-pixel-wide column of h rows; LD32/ST32 perform the
       (possibly unaligned) 32-bit load/store. */
    int rows;
    for (rows = h; rows > 0; rows--) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
903
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy an 8-pixel-wide column of h rows as two 32-bit words per row. */
    int rows;
    for (rows = h; rows > 0; rows--) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
915
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 16-pixel-wide column of h rows as four 32-bit words per row. */
    int rows;
    for (rows = h; rows > 0; rows--) {
        ST32(dst,      LD32(src));
        ST32(dst + 4,  LD32(src + 4));
        ST32(dst + 8,  LD32(src + 8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}
929
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 17-pixel-wide column (16+1, as needed by qpel filtering):
       four 32-bit words plus one trailing byte per row. */
    int rows;
    for (rows = h; rows > 0; rows--) {
        ST32(dst,      LD32(src));
        ST32(dst + 4,  LD32(src + 4));
        ST32(dst + 8,  LD32(src + 8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
944
945 static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
946 {
947 int i;
948 for(i=0; i<h; i++)
949 {
950 ST32(dst , LD32(src ));
951 ST32(dst+4 , LD32(src+4 ));
952 dst[8]= src[8];
953 dst+=dstStride;
954 src+=srcStride;
955 }
956 }
957
958
/*
 * Template generating the full set of MPEG-4 quarter-pel motion
 * compensation functions for 8x8 and 16x16 luma blocks.
 *
 *   r      - rounding tag used by the instantiation (not referenced in
 *            the C bodies themselves)
 *   OPNAME - prefix of the generated functions (put_, put_no_rnd_, avg_)
 *   RND    - infix selecting the rounding flavour of intermediate
 *            "put" helpers called from the mc entry points
 *   OP     - macro storing one output pixel from a filter sum
 *            (uses the clipping table 'cm' declared in each function)
 *
 * The _mcXY_c entry points encode the sub-pel position in the suffix:
 * X is the horizontal and Y the vertical offset in quarter pels (0..3).
 * Half-pel planes come from the (20,-6,3,-1)-weighted lowpass filters
 * below; quarter-pel results average a half-pel plane with a neighbour
 * plane via the pixels*_l2/_l4 helpers. Near the block borders the
 * filters re-use (mirror) edge samples - e.g. src[8] appearing twice -
 * instead of reading outside the block; this is deliberate, per the
 * MPEG-4 edge handling, not an off-by-one.
 *
 * The ff_*_old_c variants are kept for reference/testing; the non-old
 * versions compute the same positions with fewer intermediate buffers.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
/* horizontal lowpass over an 8-wide block, h rows */\
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* vertical lowpass over an 8-wide block; reads 9 source rows */\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
/* horizontal lowpass over a 16-wide block, h rows */\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* vertical lowpass over a 16-wide block; reads 17 source rows */\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
/* 8x8 entry points; suffix mcXY = (x,y) quarter-pel offset */\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
/* 16x16 entry points, same layout as the 8x8 set above */\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
1441
/* Pixel store operators for QPEL_MC. 'b' is a filter sum scaled by 32;
 * cm[] clips to pixel range. The rounding variants add 16 before the
 * shift, the no_rnd variants add 15 (round-down), matching the MPEG-4
 * rounding-control semantics. op_avg additionally averages (with
 * rounding) against the previous destination pixel. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate rounded put, non-rounded put and rounded avg variants. */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1455
1456 #if 1
/*
 * Template generating the H.264 6-tap (1,-5,20,20,-5,1) half-pel
 * interpolation primitives for 4x4, 8x8 and 16x16 blocks.
 *
 *   OPNAME - prefix of the generated functions
 *   OP     - stores one pixel from a once-filtered sum
 *   OP2    - stores one pixel from a twice-filtered (h then v) sum,
 *            which carries a larger scale factor than OP's input
 *            (exact scaling is defined by the instantiating ops, not
 *            visible here)
 *
 * The h/v filters read 2 samples before and 3 after each output
 * position. The hv variants first filter horizontally into the 16-bit
 * 'tmp' buffer - including 2 extra rows above and 3 below - and then
 * filter that buffer vertically. 'cm' is the clipping table referenced
 * by the OP/OP2 macros.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    /* pass 1: horizontal filter into tmp, h+5 rows for the v taps */\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind to the tmp row corresponding to output row 0 */\
    tmp -= tmpStride*(h+5-2);\
    /* pass 2: vertical filter on tmp */\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    /* pass 1: horizontal filter into tmp, h+5 rows for the v taps */\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind to the tmp row corresponding to output row 0 */\
    tmp -= tmpStride*(h+5-2);\
    /* pass 2: vertical filter on tmp */\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
/* 16-wide variants built from four 8x8 quadrant calls */\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/*
 * H264_MC(OPNAME, SIZE): expands to the 16 quarter-pel motion compensation
 * functions _mc00 .. _mc33 for an SIZE x SIZE block.  The two digits of each
 * name are the x and y quarter-pel phases.  Half-pel planes come from the
 * 6-tap *_h/_v/_hv_lowpass helpers; quarter-pel positions are formed as the
 * rounded average (via *_pixels*_l2) of two neighbouring full/half-pel planes.
 * 'full' buffers hold SIZE+5 rows so the vertical filter has its 2 rows of
 * context above and 3 below; 'full_mid' points at the block proper.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
1796
/* Store primitives used by the H.264 lowpass templates above:
   op_*  finish the 1-D 6-tap filters (round with +16, shift by 5),
   op2_* finish the separable 2-D filter  (round with +512, shift by 10);
   both clamp through the cm crop table, avg variants then round-average
   with the existing destination pixel. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]

/* instantiate the put/avg lowpass kernels and the 4x4/8x8/16x16 MC sets */
H264_LOWPASS(put_ , op_put, op2_put)
H264_LOWPASS(avg_ , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
1817
1818 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1819 uint8_t *cm = cropTbl + MAX_NEG_CROP;
1820 int i;
1821
1822 for(i=0; i<h; i++){
1823 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1824 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1825 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1826 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1827 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1828 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1829 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1830 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1831 dst+=dstStride;
1832 src+=srcStride;
1833 }
1834 }
1835
1836 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1837 uint8_t *cm = cropTbl + MAX_NEG_CROP;
1838 int i;
1839
1840 for(i=0; i<w; i++){
1841 const int src_1= src[ -srcStride];
1842 const int src0 = src[0 ];
1843 const int src1 = src[ srcStride];
1844 const int src2 = src[2*srcStride];
1845 const int src3 = src[3*srcStride];
1846 const int src4 = src[4*srcStride];
1847 const int src5 = src[5*srcStride];
1848 const int src6 = src[6*srcStride];
1849 const int src7 = src[7*srcStride];
1850 const int src8 = src[8*srcStride];
1851 const int src9 = src[9*srcStride];
1852 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1853 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1854 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1855 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1856 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1857 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1858 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1859 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
1860 src++;
1861 dst++;
1862 }
1863 }
1864
/* WMV2 half-pel MC, phase (0,0): plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
1868
/* Phase (1,0): rounded average of the source and its horizontally
   lowpass-filtered version. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
1874
/* Phase (2,0): horizontal half-pel position, straight horizontal lowpass. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1878
/* Phase (3,0): rounded average of the filtered plane and the pixel one
   to the right of the source. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
1884
/* Phase (0,2): vertical half-pel position, straight vertical lowpass. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1888
/* Phase (1,2): average of the vertical-only plane and the 2-D
   (horizontal-then-vertical) plane.  halfH holds 11 filtered rows starting
   one row above the block so the vertical pass has context. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* Phase (3,2): like mc12 but the vertical-only plane is taken one pixel
   to the right of the source. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* Phase (2,2): full 2-D half-pel, horizontal lowpass followed by vertical. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
1912
1913
/*
 * Sum of absolute differences (SAD) between two 16x16 pixel blocks.
 * line_size is the row stride of both blocks.
 */
static inline int pix_abs16x16_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1941
/*
 * SAD of a 16x16 block against the horizontal half-pel interpolation of
 * pix2 (rounded average of each pixel and its right neighbour; reads one
 * column past the block).
 */
static int pix_abs16x16_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1969
/*
 * SAD of a 16x16 block against the vertical half-pel interpolation of
 * pix2 (rounded average of each pixel and the one directly below it).
 */
static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
1999
/*
 * SAD of a 16x16 block against the 2-D half-pel interpolation of pix2
 * (rounded average of the 2x2 neighbourhood; reads one extra column and
 * one extra row).
 */
static int pix_abs16x16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2029
/*
 * Sum of absolute differences (SAD) between two 8x8 pixel blocks.
 * line_size is the row stride of both blocks.
 */
static inline int pix_abs8x8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2049
/*
 * SAD of an 8x8 block against the horizontal half-pel interpolation of
 * pix2 (reads one column past the block).
 */
static int pix_abs8x8_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2069
/*
 * SAD of an 8x8 block against the vertical half-pel interpolation of pix2.
 */
static int pix_abs8x8_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2091
/*
 * SAD of an 8x8 block against the 2-D half-pel interpolation of pix2
 * (reads one extra column and one extra row).
 */
static int pix_abs8x8_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int y, x;

    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2113
/* DSPContext-style comparison wrapper: 16x16 SAD; the context argument
   is unused. */
static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs16x16_c(a,b,stride);
}
2117
/* DSPContext-style comparison wrapper: 8x8 SAD; the context argument
   is unused. */
static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs8x8_c(a,b,stride);
}
2121
2122 /**
2123 * permutes an 8x8 block.
2124 * @param block the block which will be permuted according to the given permutation vector
2125 * @param permutation the permutation vector
2126 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2127 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2128 * (inverse) permutated to scantable order!
2129 */
2130 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2131 {
2132 int i;
2133 DCTELEM temp[64];
2134
2135 if(last<=0) return;
2136 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2137
2138 for(i=0; i<=last; i++){
2139 const int j= scantable[i];
2140 temp[j]= block[j];
2141 block[j]=0;
2142 }
2143
2144 for(i=0; i<=last; i++){
2145 const int j= scantable[i];
2146 const int perm_j= permutation[j];
2147 block[perm_j]= temp[j];
2148 }
2149 }
2150
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    /* zero all six 8x8 coefficient blocks of one macroblock at once */
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
2158
/*
 * Byte-wise dst[i] += src[i] for 0 <= i < w; additions wrap modulo 256.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    for (i = 0; i < w; i++)
        dst[i] += src[i];
}
2174
/*
 * Byte-wise dst[i] = src1[i] - src2[i] for 0 <= i < w; differences wrap
 * modulo 256.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    for (i = 0; i < w; i++)
        dst[i] = src1[i] - src2[i];
}
2190
/* BUTTERFLY2: one Hadamard butterfly, writing sum and difference of i1,i2
   into o1,o2.  Note: expands to two statements (no do/while) and evaluates
   its arguments twice -- pass side-effect-free expressions only. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* BUTTERFLY1: in-place butterfly, x <- x+y and y <- x-y (old values). */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* BUTTERFLYA: |x+y| + |x-y|, the absolute output of a final butterfly. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2205
/*
 * SATD metric: sum of absolute values of the 8x8 2-D Hadamard transform of
 * the difference src - dst.  The void* context argument is unused.
 */
static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
    int i;
    int temp[64];
    int sum=0;

    /* horizontal pass: three butterfly stages per row of differences */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: first two stages in place, last stage folded into the
       absolute-value accumulation via BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
2255
/*
 * Like hadamard8_diff_c, but transforms src minus a constant predictor
 * 'mean' instead of a second 8x8 block.
 */
static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
    int i;
    int temp[64];
    int sum=0;
//FIXME OOOPS ignore 0 term instead of mean mess
    /* horizontal pass: three butterfly stages per row of (src - mean) */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-mean,src[stride*i+1]-mean);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-mean,src[stride*i+3]-mean);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-mean,src[stride*i+5]-mean);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-mean,src[stride*i+7]-mean);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass; final stage folded into the |.| accumulation */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    return sum;
}
2299
/*
 * Metric: forward DCT of the 8x8 difference src1 - src2, then the sum of
 * the absolute coefficient values.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* 8-byte aligned scratch block for the DCT */
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum+= ABS(temp[i]);

    return sum;
}
2314
2315 void simple_idct(DCTELEM *block); //FIXME
2316
/*
 * Metric: error introduced by quantizing the 8x8 difference block.
 * The difference is saved, run through quantize -> dequantize -> idct
 * (fast_dct_quantize presumably performs the forward DCT internally --
 * verify against its definition), and the squared deviation from the
 * saved original is summed.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* one aligned scratch buffer holding both the working block and the
       untouched backup copy */
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    s->mb_intra=0; /* force inter quantization path */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
2339
/*
 * Rate-distortion metric for one 8x8 block: quantizes the difference
 * src1 - src2, counts the VLC bits needed to code the quantized
 * coefficients, reconstructs the block and measures its SSE against src1.
 * Returns distortion + weighted bit cost (bits * qscale^2 * 109 / 128).
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    uint64_t __align8 aligned_bak[stride];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distoration, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    /* keep a copy of the prediction (src2) to reconstruct into later,
       8 bytes per row copied as two 32-bit words */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC length tables for the current coding mode */
    if (s->mb_intra) {
        start_i = 1;
        length = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* count run/level codes; level+64 indexes tables spanning -64..63,
           anything outside that range costs the escape length */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the final coefficient uses the 'last' table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    if(last>=0){
        s->dct_unquantize(s, temp, 0, s->qscale);
    }

    /* reconstruct on top of the saved prediction and measure the error */
    s->dsp.idct_add(bak, stride, temp);

    distoration= s->dsp.sse[1](NULL, bak, src1, stride);

    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2413
/*
 * Rate-only metric: quantizes the 8x8 difference src1 - src2 and returns
 * the number of VLC bits that coding the quantized block would take.
 * Same bit counting as rd8x8_c, without reconstruction or distortion.
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC length tables for the current coding mode */
    if (s->mb_intra) {
        start_i = 1;
        length = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* run/level bit counting; levels outside -64..63 cost esc_length */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        /* the final coefficient uses the 'last' table */
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2471
2472
/* Build 16x16 comparison functions from the 8x8 ones.  WARPER88_1616
   presumably combines the 8x8 metric over the four quadrants of the
   16x16 block -- see the macro's definition for the exact combination. */
WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
WARPER88_1616(rd8x8_c, rd16x16_c)
WARPER88_1616(bit8x8_c, bit16x16_c)
2478
2479 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2480 converted */
/* JPEG reference IDCT followed by a clamped store of the result. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* JPEG reference IDCT followed by a clamped add onto the destination. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
2491
2492 /* init static data */
2493 void dsputil_static_init(void)
2494 {
2495 int i;
2496
2497 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
2498 for(i=0;i<MAX_NEG_CROP;i++) {
2499 cropTbl[i] = 0;
2500 cropTbl[i + MAX_NEG_CROP + 256] = 255;
2501 }
2502
2503 for(i=0;i<512;i++) {
2504 squareTbl[i] = (i - 256) * (i - 256);
2505 }
2506
2507 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2508 }
2509
2510
2511 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2512 {
2513 int i;
2514
2515 #ifdef CONFIG_ENCODERS
2516 if(avctx->dct_algo==FF_DCT_FASTINT)
2517 c->fdct = fdct_ifast;
2518 else
2519 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2520 #endif //CONFIG_ENCODERS
2521
2522 if(avctx->idct_algo==FF_IDCT_INT){
2523 c->idct_put= ff_jref_idct_put;
2524 c->idct_add= ff_jref_idct_add;
2525 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2526 }else{ //accurate/default
2527 c->idct_put= simple_idct_put;
2528 c->idct_add= simple_idct_add;
2529 c->idct_permutation_type= FF_NO_IDCT_PERM;
2530 }
2531
2532 c->get_pixels = get_pixels_c;
2533 c->diff_pixels = diff_pixels_c;
2534 c->put_pixels_clamped = put_pixels_clamped_c;
2535 c->add_pixels_clamped = add_pixels_clamped_c;
2536 c->gmc1 = gmc1_c;
2537 c->gmc = gmc_c;
2538 c->clear_blocks = clear_blocks_c;
2539 c->pix_sum = pix_sum_c;
2540 c->pix_norm1 = pix_norm1_c;
2541 c->sse[0]= sse16_c;
2542 c->sse[1]= sse8_c;
2543
2544 /* TODO [0] 16 [1] 8 */
2545 c->pix_abs16x16 = pix_abs16x16_c;
2546 c->pix_abs16x16_x2 = pix_abs16x16_x2_c;
2547 c->pix_abs16x16_y2 = pix_abs16x16_y2_c;
2548 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
2549 c->pix_abs8x8 = pix_abs8x8_c;
2550 c->pix_abs8x8_x2 = pix_abs8x8_x2_c;
2551 c->pix_abs8x8_y2 = pix_abs8x8_y2_c;
2552 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
2553
2554 #define dspfunc(PFX, IDX, NUM) \
2555 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
2556 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
2557 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
2558 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
2559
2560 dspfunc(put, 0, 16);
2561 dspfunc(put_no_rnd, 0, 16);
2562 dspfunc(put, 1, 8);
2563 dspfunc(put_no_rnd, 1, 8);
2564
2565 dspfunc(avg, 0, 16);
2566 dspfunc(avg_no_rnd, 0, 16);
2567 dspfunc(avg, 1, 8);
2568 dspfunc(avg_no_rnd, 1, 8);
2569 #undef dspfunc
2570
2571 #define dspfunc(PFX, IDX, NUM) \
2572 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2573 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2574 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2575 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2576 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2577 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2578 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2579 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2580 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2581 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2582 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2583 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2584 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2585 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2586 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2587 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2588
2589 dspfunc(put_qpel, 0, 16);
2590 dspfunc(put_no_rnd_qpel, 0, 16);
2591
2592 dspfunc(avg_qpel, 0, 16);
2593 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2594
2595 dspfunc(put_qpel, 1, 8);
2596 dspfunc(put_no_rnd_qpel, 1, 8);
2597
2598 dspfunc(avg_qpel, 1, 8);
2599 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2600
2601 dspfunc(put_h264_qpel, 0, 16);
2602 dspfunc(put_h264_qpel, 1, 8);
2603 dspfunc(put_h264_qpel, 2, 4);
2604 dspfunc(avg_h264_qpel, 0, 16);
2605 dspfunc(avg_h264_qpel, 1, 8);
2606 dspfunc(avg_h264_qpel, 2, 4);
2607
2608 #undef dspfunc
2609 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
2610 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
2611 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
2612 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
2613 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
2614 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
2615
2616 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
2617 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2618 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2619 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2620 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2621 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2622 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2623 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2624
2625 c->hadamard8_diff[0]= hadamard8_diff16_c;
2626 c->hadamard8_diff[1]= hadamard8_diff_c;
2627 c->hadamard8_abs = hadamard8_abs_c;
2628
2629 c->dct_sad[0]= dct_sad16x16_c;
2630 c->dct_sad[1]= dct_sad8x8_c;
2631
2632 c->sad[0]= sad16x16_c;
2633 c->sad[1]= sad8x8_c;
2634
2635 c->quant_psnr[0]= quant_psnr16x16_c;
2636 c->quant_psnr[1]= quant_psnr8x8_c;
2637
2638 c->rd[0]= rd16x16_c;
2639 c->rd[1]= rd8x8_c;
2640
2641 c->bit[0]= bit16x16_c;
2642 c->bit[1]= bit8x8_c;
2643
2644 c->add_bytes= add_bytes_c;
2645 c->diff_bytes= diff_bytes_c;
2646
2647 #ifdef HAVE_MMX
2648 dsputil_init_mmx(c, avctx);
2649 #endif
2650 #ifdef ARCH_ARMV4L
2651 dsputil_init_armv4l(c, avctx);
2652 #endif
2653 #ifdef HAVE_MLIB
2654 dsputil_init_mlib(c, avctx);
2655 #endif
2656 #ifdef ARCH_ALPHA
2657 dsputil_init_alpha(c, avctx);
2658 #endif
2659 #ifdef ARCH_POWERPC
2660 dsputil_init_ppc(c, avctx);
2661 #endif
2662 #ifdef HAVE_MMI
2663 dsputil_init_mmi(c, avctx);
2664 #endif
2665 #ifdef ARCH_SH4
2666 dsputil_init_sh4(c,avctx);
2667 #endif
2668
2669 switch(c->idct_permutation_type){
2670 case FF_NO_IDCT_PERM:
2671 for(i=0; i<64; i++)
2672 c->idct_permutation[i]= i;
2673 break;
2674 case FF_LIBMPEG2_IDCT_PERM:
2675 for(i=0; i<64; i++)
2676 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
2677 break;
2678 case FF_SIMPLE_IDCT_PERM:
2679 for(i=0; i<64; i++)
2680 c->idct_permutation[i]= simple_mmx_permutation[i];
2681 break;
2682 case FF_TRANSPOSE_IDCT_PERM:
2683 for(i=0; i<64; i++)
2684 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
2685 break;
2686 default:
2687 fprintf(stderr, "Internal error, IDCT permutation not set\n");
2688 }
2689 }
2690