h261
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
1/*
2 * DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
8f2ab833 4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
de6d9b64 5 *
ff4ec49e
FB
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
de6d9b64 10 *
ff4ec49e 11 * This library is distributed in the hope that it will be useful,
de6d9b64 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
de6d9b64 15 *
ff4ec49e
FB
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
7ff037e9 19 *
59fe111e 20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
de6d9b64 21 */
983e3246
MN
22
23/**
24 * @file dsputil.c
25 * DSP utils
26 */
27
de6d9b64
FB
28#include "avcodec.h"
29#include "dsputil.h"
1457ab52 30#include "mpegvideo.h"
b0368839 31#include "simple_idct.h"
65e4c8c9 32#include "faandct.h"
5596c60c 33
8b69867f
MN
34uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
35uint32_t squareTbl[512] = {0, };
de6d9b64 36
/*
 * Zigzag scan order for an 8x8 block: entry k holds the raster index
 * (row * 8 + column) of the k-th coefficient in scan order.
 */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
47
10acc479
RS
/*
 * Zigzag scan used with the 248 IDCT.
 * NOTE: unlike the specification, the two fields are interleaved here.
 */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
60
2f349de2 61/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
8b69867f 62uint16_t __align8 inv_zigzag_direct16[64] = {0, };
2f349de2 63
/*
 * Alternate horizontal scan order for an 8x8 block: entry k is the raster
 * index (row * 8 + column) of the k-th coefficient in scan order.
 */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
74
/*
 * Alternate vertical scan order for an 8x8 block: entry k is the raster
 * index (row * 8 + column) of the k-th coefficient in scan order.
 */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
85
/*
 * Fixed-point reciprocals: for every 0 <= a <= 65536 and 2 <= b <= 255,
 * (a * inverse[b]) >> 32 == a / b when the product is formed in 64 bits.
 * Entries 0 and 1 lie outside that guarantee.
 */
const uint32_t inverse[256] = {
             0, 4294967295U, 2147483648U, 1431655766,  1073741824,   858993460,   715827883,   613566757,
     536870912,   477218589,   429496730,   390451573,   357913942,   330382100,   306783379,   286331154,
     268435456,   252645136,   238609295,   226050911,   214748365,   204522253,   195225787,   186737709,
     178956971,   171798692,   165191050,   159072863,   153391690,   148102321,   143165577,   138547333,
     134217728,   130150525,   126322568,   122713352,   119304648,   116080198,   113025456,   110127367,
     107374183,   104755300,   102261127,    99882961,    97612894,    95443718,    93368855,    91382283,
      89478486,    87652394,    85899346,    84215046,    82595525,    81037119,    79536432,    78090315,
      76695845,    75350304,    74051161,    72796056,    71582789,    70409300,    69273667,    68174085,
      67108864,    66076420,    65075263,    64103990,    63161284,    62245903,    61356676,    60492498,
      59652324,    58835169,    58040099,    57266231,    56512728,    55778797,    55063684,    54366675,
      53687092,    53024288,    52377650,    51746594,    51130564,    50529028,    49941481,    49367441,
      48806447,    48258060,    47721859,    47197443,    46684428,    46182445,    45691142,    45210183,
      44739243,    44278014,    43826197,    43383509,    42949673,    42524429,    42107523,    41698712,
      41297763,    40904451,    40518560,    40139882,    39768216,    39403370,    39045158,    38693400,
      38347923,    38008561,    37675152,    37347542,    37025581,    36709123,    36398028,    36092163,
      35791395,    35495598,    35204650,    34918434,    34636834,    34359739,    34087043,    33818641,
      33554432,    33294321,    33038210,    32786010,    32537632,    32292988,    32051995,    31814573,
      31580642,    31350127,    31122952,    30899046,    30678338,    30460761,    30246249,    30034737,
      29826162,    29620465,    29417585,    29217465,    29020050,    28825284,    28633116,    28443493,
      28256364,    28071682,    27889399,    27709467,    27531842,    27356480,    27183338,    27012373,
      26843546,    26676816,    26512144,    26349493,    26188825,    26030105,    25873297,    25718368,
      25565282,    25414008,    25264514,    25116768,    24970741,    24826401,    24683721,    24542671,
      24403224,    24265352,    24129030,    23994231,    23860930,    23729102,    23598722,    23469767,
      23342214,    23216040,    23091223,    22967740,    22845571,    22724695,    22605092,    22486740,
      22369622,    22253717,    22139007,    22025474,    21913099,    21801865,    21691755,    21582751,
      21474837,    21367997,    21262215,    21157475,    21053762,    20951060,    20849356,    20748635,
      20648882,    20550083,    20452226,    20355296,    20259280,    20164166,    20069941,    19976593,
      19884108,    19792477,    19701685,    19611723,    19522579,    19434242,    19346700,    19259944,
      19173962,    19088744,    19004281,    18920561,    18837576,    18755316,    18673771,    18592933,
      18512791,    18433337,    18354562,    18276457,    18199014,    18122225,    18046082,    17970575,
      17895698,    17821442,    17747799,    17674763,    17602325,    17530479,    17459217,    17388532,
      17318417,    17248865,    17179870,    17111424,    17043522,    16976156,    16909321,    16843010,
};
121
b0368839
MN
/* Coefficient order expected on input by simple_idct_mmx. */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
133
/**
 * Add up every sample of a 16x16 block.
 *
 * @param pix        first sample of the top row
 * @param line_size  offset between the starts of two consecutive rows
 * @return           sum of the 256 samples
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
155
0c1a9eda 156static int pix_norm1_c(uint8_t * pix, int line_size)
3aa102be
MN
157{
158 int s, i, j;
0c1a9eda 159 uint32_t *sq = squareTbl + 256;
3aa102be
MN
160
161 s = 0;
162 for (i = 0; i < 16; i++) {
163 for (j = 0; j < 16; j += 8) {
2a006cd3 164#if 0
3aa102be
MN
165 s += sq[pix[0]];
166 s += sq[pix[1]];
167 s += sq[pix[2]];
168 s += sq[pix[3]];
169 s += sq[pix[4]];
170 s += sq[pix[5]];
171 s += sq[pix[6]];
172 s += sq[pix[7]];
2a006cd3
FL
173#else
174#if LONG_MAX > 2147483647
175 register uint64_t x=*(uint64_t*)pix;
176 s += sq[x&0xff];
177 s += sq[(x>>8)&0xff];
178 s += sq[(x>>16)&0xff];
179 s += sq[(x>>24)&0xff];
180 s += sq[(x>>32)&0xff];
181 s += sq[(x>>40)&0xff];
182 s += sq[(x>>48)&0xff];
183 s += sq[(x>>56)&0xff];
184#else
185 register uint32_t x=*(uint32_t*)pix;
186 s += sq[x&0xff];
187 s += sq[(x>>8)&0xff];
188 s += sq[(x>>16)&0xff];
189 s += sq[(x>>24)&0xff];
190 x=*(uint32_t*)(pix+4);
191 s += sq[x&0xff];
192 s += sq[(x>>8)&0xff];
193 s += sq[(x>>16)&0xff];
194 s += sq[(x>>24)&0xff];
195#endif
196#endif
3aa102be
MN
197 pix += 8;
198 }
199 pix += line_size - 16;
200 }
201 return s;
202}
203
3d2e8cce
MN
/**
 * Apply bswap_32() to w 32 bit words.
 *
 * Words are processed in increasing index order and each result is stored
 * before the next word is read.
 *
 * @param dst  receives the w converted words
 * @param src  the w input words
 * @param w    number of words; nothing is done when it is <= 0
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int n;

    for (n = 0; n < w; n++)
        dst[n] = bswap_32(src[n]);
}
3aa102be 221
bb198e19 222static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
1457ab52
MN
223{
224 int s, i;
0c1a9eda 225 uint32_t *sq = squareTbl + 256;
1457ab52
MN
226
227 s = 0;
bb198e19 228 for (i = 0; i < h; i++) {
1457ab52
MN
229 s += sq[pix1[0] - pix2[0]];
230 s += sq[pix1[1] - pix2[1]];
231 s += sq[pix1[2] - pix2[2]];
232 s += sq[pix1[3] - pix2[3]];
233 s += sq[pix1[4] - pix2[4]];
234 s += sq[pix1[5] - pix2[5]];
235 s += sq[pix1[6] - pix2[6]];
236 s += sq[pix1[7] - pix2[7]];
237 pix1 += line_size;
238 pix2 += line_size;
239 }
240 return s;
241}
242
bb198e19 243static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
9c76bd48 244{
6b026927
FH
245 int s, i;
246 uint32_t *sq = squareTbl + 256;
9c76bd48
BF
247
248 s = 0;
bb198e19 249 for (i = 0; i < h; i++) {
6b026927
FH
250 s += sq[pix1[ 0] - pix2[ 0]];
251 s += sq[pix1[ 1] - pix2[ 1]];
252 s += sq[pix1[ 2] - pix2[ 2]];
253 s += sq[pix1[ 3] - pix2[ 3]];
254 s += sq[pix1[ 4] - pix2[ 4]];
255 s += sq[pix1[ 5] - pix2[ 5]];
256 s += sq[pix1[ 6] - pix2[ 6]];
257 s += sq[pix1[ 7] - pix2[ 7]];
258 s += sq[pix1[ 8] - pix2[ 8]];
259 s += sq[pix1[ 9] - pix2[ 9]];
260 s += sq[pix1[10] - pix2[10]];
261 s += sq[pix1[11] - pix2[11]];
262 s += sq[pix1[12] - pix2[12]];
263 s += sq[pix1[13] - pix2[13]];
264 s += sq[pix1[14] - pix2[14]];
265 s += sq[pix1[15] - pix2[15]];
2a006cd3 266
6b026927
FH
267 pix1 += line_size;
268 pix2 += line_size;
9c76bd48
BF
269 }
270 return s;
271}
272
0c1a9eda 273static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
de6d9b64 274{
de6d9b64
FB
275 int i;
276
277 /* read the pixels */
de6d9b64 278 for(i=0;i<8;i++) {
c13e1abd
FH
279 block[0] = pixels[0];
280 block[1] = pixels[1];
281 block[2] = pixels[2];
282 block[3] = pixels[3];
283 block[4] = pixels[4];
284 block[5] = pixels[5];
285 block[6] = pixels[6];
286 block[7] = pixels[7];
287 pixels += line_size;
288 block += 8;
de6d9b64
FB
289 }
290}
291
0c1a9eda
ZK
292static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
293 const uint8_t *s2, int stride){
9dbcbd92
MN
294 int i;
295
296 /* read the pixels */
9dbcbd92 297 for(i=0;i<8;i++) {
c13e1abd
FH
298 block[0] = s1[0] - s2[0];
299 block[1] = s1[1] - s2[1];
300 block[2] = s1[2] - s2[2];
301 block[3] = s1[3] - s2[3];
302 block[4] = s1[4] - s2[4];
303 block[5] = s1[5] - s2[5];
304 block[6] = s1[6] - s2[6];
305 block[7] = s1[7] - s2[7];
9dbcbd92
MN
306 s1 += stride;
307 s2 += stride;
c13e1abd 308 block += 8;
9dbcbd92
MN
309 }
310}
311
312
0c1a9eda 313static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
eb4b3dd3 314 int line_size)
de6d9b64 315{
de6d9b64 316 int i;
0c1a9eda 317 uint8_t *cm = cropTbl + MAX_NEG_CROP;
de6d9b64
FB
318
319 /* read the pixels */
de6d9b64 320 for(i=0;i<8;i++) {
c13e1abd
FH
321 pixels[0] = cm[block[0]];
322 pixels[1] = cm[block[1]];
323 pixels[2] = cm[block[2]];
324 pixels[3] = cm[block[3]];
325 pixels[4] = cm[block[4]];
326 pixels[5] = cm[block[5]];
327 pixels[6] = cm[block[6]];
328 pixels[7] = cm[block[7]];
329
330 pixels += line_size;
331 block += 8;
de6d9b64
FB
332 }
333}
334
f9ed9d85
MM
335static void put_signed_pixels_clamped_c(const DCTELEM *block,
336 uint8_t *restrict pixels,
337 int line_size)
338{
339 int i, j;
340
341 for (i = 0; i < 8; i++) {
342 for (j = 0; j < 8; j++) {
343 if (*block < -128)
344 *pixels = 0;
345 else if (*block > 127)
346 *pixels = 255;
347 else
348 *pixels = (uint8_t)(*block + 128);
349 block++;
350 pixels++;
351 }
352 pixels += (line_size - 8);
353 }
354}
355
0c1a9eda 356static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
c13e1abd 357 int line_size)
de6d9b64 358{
de6d9b64 359 int i;
0c1a9eda 360 uint8_t *cm = cropTbl + MAX_NEG_CROP;
de6d9b64
FB
361
362 /* read the pixels */
de6d9b64 363 for(i=0;i<8;i++) {
c13e1abd
FH
364 pixels[0] = cm[pixels[0] + block[0]];
365 pixels[1] = cm[pixels[1] + block[1]];
366 pixels[2] = cm[pixels[2] + block[2]];
367 pixels[3] = cm[pixels[3] + block[3]];
368 pixels[4] = cm[pixels[4] + block[4]];
369 pixels[5] = cm[pixels[5] + block[5]];
370 pixels[6] = cm[pixels[6] + block[6]];
371 pixels[7] = cm[pixels[7] + block[7]];
372 pixels += line_size;
373 block += 8;
de6d9b64
FB
374 }
375}
59fe111e
MN
376#if 0
377
/*
 * PIXOP2(OPNAME, OP): 64 bit implementation of the 8 sample wide block
 * helpers.  This definition sits in an "#if 0" branch and is therefore
 * compiled out.
 *
 * OP(dst, value) is the store operation: it receives an lvalue and the value
 * computed for it.  Eight samples are handled per row through LD64 and a
 * 64 bit store.
 *
 * NOTE(review): the plain copy is generated as OPNAME_pixels, while the
 * first CALL_2X_PIXELS line refers to OPNAME_pixels_c -- reconcile the two
 * names before re-enabling this branch.
 */
#define PIXOP2(OPNAME, OP) \
/* Plain store of h rows. */\
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        OP(*((uint64_t *)block), LD64(pixels));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
/* Average of each sample with its right neighbour, rounding down. */\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        const uint64_t a = LD64(pixels);\
        const uint64_t b = LD64(pixels + 1);\
        OP(*((uint64_t *)block), (a & b) + (((a ^ b) & 0xFEFEFEFEFEFEFEFEULL) >> 1));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
/* Average of each sample with its right neighbour, rounding up. */\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        const uint64_t a = LD64(pixels);\
        const uint64_t b = LD64(pixels + 1);\
        OP(*((uint64_t *)block), (a | b) - (((a ^ b) & 0xFEFEFEFEFEFEFEFEULL) >> 1));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
/* Average of each sample with the one below it, rounding down. */\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        const uint64_t a = LD64(pixels);\
        const uint64_t b = LD64(pixels + line_size);\
        OP(*((uint64_t *)block), (a & b) + (((a ^ b) & 0xFEFEFEFEFEFEFEFEULL) >> 1));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
/* Average of each sample with the one below it, rounding up. */\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        const uint64_t a = LD64(pixels);\
        const uint64_t b = LD64(pixels + line_size);\
        OP(*((uint64_t *)block), (a | b) - (((a ^ b) & 0xFEFEFEFEFEFEFEFEULL) >> 1));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
/* Four sample average (2x2 neighbourhood) with a rounding term of 2; two rows per iteration. */\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    const uint64_t top_a = LD64(pixels);\
    const uint64_t top_b = LD64(pixels + 1);\
    uint64_t l0 = (top_a & 0x0303030303030303ULL)\
                + (top_b & 0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0 = ((top_a & 0xFCFCFCFCFCFCFCFCULL) >> 2)\
                + ((top_b & 0xFCFCFCFCFCFCFCFCULL) >> 2);\
    uint64_t l1, h1;\
\
    pixels += line_size;\
    for (row = 0; row < h; row += 2) {\
        uint64_t a = LD64(pixels);\
        uint64_t b = LD64(pixels + 1);\
        l1 = (a & 0x0303030303030303ULL)\
           + (b & 0x0303030303030303ULL);\
        h1 = ((a & 0xFCFCFCFCFCFCFCFCULL) >> 2)\
           + ((b & 0xFCFCFCFCFCFCFCFCULL) >> 2);\
        OP(*((uint64_t *)block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0F0F0F0F0FULL));\
        pixels += line_size;\
        block  += line_size;\
        a = LD64(pixels);\
        b = LD64(pixels + 1);\
        l0 = (a & 0x0303030303030303ULL)\
           + (b & 0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0 = ((a & 0xFCFCFCFCFCFCFCFCULL) >> 2)\
           + ((b & 0xFCFCFCFCFCFCFCFCULL) >> 2);\
        OP(*((uint64_t *)block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0F0F0F0F0FULL));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
/* Same as above with a rounding term of 1. */\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    const uint64_t top_a = LD64(pixels);\
    const uint64_t top_b = LD64(pixels + 1);\
    uint64_t l0 = (top_a & 0x0303030303030303ULL)\
                + (top_b & 0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0 = ((top_a & 0xFCFCFCFCFCFCFCFCULL) >> 2)\
                + ((top_b & 0xFCFCFCFCFCFCFCFCULL) >> 2);\
    uint64_t l1, h1;\
\
    pixels += line_size;\
    for (row = 0; row < h; row += 2) {\
        uint64_t a = LD64(pixels);\
        uint64_t b = LD64(pixels + 1);\
        l1 = (a & 0x0303030303030303ULL)\
           + (b & 0x0303030303030303ULL);\
        h1 = ((a & 0xFCFCFCFCFCFCFCFCULL) >> 2)\
           + ((b & 0xFCFCFCFCFCFCFCFCULL) >> 2);\
        OP(*((uint64_t *)block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0F0F0F0F0FULL));\
        pixels += line_size;\
        block  += line_size;\
        a = LD64(pixels);\
        b = LD64(pixels + 1);\
        l0 = (a & 0x0303030303030303ULL)\
           + (b & 0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0 = ((a & 0xFCFCFCFCFCFCFCFCULL) >> 2)\
           + ((b & 0xFCFCFCFCFCFCFCFCULL) >> 2);\
        OP(*((uint64_t *)block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0F0F0F0F0FULL));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* Averaging store for eight packed samples: per-byte average rounded up. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else /* 32 bit variant (the 64 bit variant above is compiled out by #if 0) */
519
/*
 * PIXOP2(OPNAME, OP): 32 bit implementation of the block copy / average
 * helpers.
 *
 * OP(dst, value) is the store operation: it receives an lvalue and the value
 * computed for it.  Samples are fetched with LD16/LD32 and stored through
 * 16/32 bit lvalues.  The expansion provides:
 *   - OPNAME_pixels{2,4,8}_c             plain store
 *   - OPNAME_[no_rnd_]pixels*_l2         combination of two sources
 *   - OPNAME_[no_rnd_]pixels*_l4         combination of four sources
 *   - OPNAME_[no_rnd_]pixels*_{x2,y2,xy2}_c  neighbour averages
 *   - the pixels16 entry points, produced by CALL_2X_PIXELS (defined outside
 *     this section) from the 8 wide functions.
 */
#define PIXOP2(OPNAME, OP) \
/* Plain store, 2 samples per row. */\
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        OP(*((uint16_t *)(block)), LD16(pixels));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
/* Plain store, 4 samples per row. */\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        OP(*((uint32_t *)(block)), LD32(pixels));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
/* Plain store, 8 samples per row, done as two 4 sample halves. */\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        OP(*((uint32_t *)(block)),     LD32(pixels));\
        OP(*((uint32_t *)(block + 4)), LD32(pixels + 4));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
/* A plain store involves no rounding, so this simply forwards. */\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
/* Combine two 8 wide sources with no_rnd_avg32(); every buffer has its own stride. */\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h)\
{\
    int row, x;\
    for (row = 0; row < h; row++) {\
        for (x = 0; x < 8; x += 4) {\
            const uint32_t a = LD32(&src1[row * src_stride1 + x]);\
            const uint32_t b = LD32(&src2[row * src_stride2 + x]);\
            OP(*((uint32_t *)&dst[row * dst_stride + x]), no_rnd_avg32(a, b));\
        }\
    }\
}\
\
/* Combine two 8 wide sources with rnd_avg32(). */\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h)\
{\
    int row, x;\
    for (row = 0; row < h; row++) {\
        for (x = 0; x < 8; x += 4) {\
            const uint32_t a = LD32(&src1[row * src_stride1 + x]);\
            const uint32_t b = LD32(&src2[row * src_stride2 + x]);\
            OP(*((uint32_t *)&dst[row * dst_stride + x]), rnd_avg32(a, b));\
        }\
    }\
}\
\
/* Combine two 4 wide sources with rnd_avg32(). */\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        const uint32_t a = LD32(&src1[row * src_stride1]);\
        const uint32_t b = LD32(&src2[row * src_stride2]);\
        OP(*((uint32_t *)&dst[row * dst_stride]), rnd_avg32(a, b));\
    }\
}\
\
/* Combine two 2 wide sources with rnd_avg32(); loads and the store are 16 bit. */\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        const uint32_t a = LD16(&src1[row * src_stride1]);\
        const uint32_t b = LD16(&src2[row * src_stride2]);\
        OP(*((uint16_t *)&dst[row * dst_stride]), rnd_avg32(a, b));\
    }\
}\
\
/* 16 wide two-source combination: left 8 columns, then right 8 columns. */\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h)\
{\
    OPNAME ## _pixels8_l2(dst    , src1    , src2    , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst + 8, src1 + 8, src2 + 8, dst_stride, src_stride1, src_stride2, h);\
}\
\
/* 16 wide two-source combination without rounding up. */\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                 int src_stride1, int src_stride2, int h)\
{\
    OPNAME ## _no_rnd_pixels8_l2(dst    , src1    , src2    , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst + 8, src1 + 8, src2 + 8, dst_stride, src_stride1, src_stride2, h);\
}\
\
/* x2: second source is the block shifted one sample to the right. */\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels + 1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    OPNAME ## _pixels8_l2(block, pixels, pixels + 1, line_size, line_size, line_size, h);\
}\
\
/* y2: second source is the block shifted one row down. */\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels + line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    OPNAME ## _pixels8_l2(block, pixels, pixels + line_size, line_size, line_size, line_size, h);\
}\
\
/* Combine four 8 wide sources: per-byte (a + b + c + d + 2) >> 2, split into low 2 bits and high 6 bits. */\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h)\
{\
    int row, x;\
    for (row = 0; row < h; row++) {\
        for (x = 0; x < 8; x += 4) {\
            const uint32_t a = LD32(&src1[row * src_stride1 + x]);\
            const uint32_t b = LD32(&src2[row * src_stride2 + x]);\
            const uint32_t c = LD32(&src3[row * src_stride3 + x]);\
            const uint32_t d = LD32(&src4[row * src_stride4 + x]);\
            const uint32_t l0 = (a & 0x03030303UL)\
                              + (b & 0x03030303UL)\
                              + 0x02020202UL;\
            const uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2)\
                              + ((b & 0xFCFCFCFCUL) >> 2);\
            const uint32_t l1 = (c & 0x03030303UL)\
                              + (d & 0x03030303UL);\
            const uint32_t h1 = ((c & 0xFCFCFCFCUL) >> 2)\
                              + ((d & 0xFCFCFCFCUL) >> 2);\
            OP(*((uint32_t *)&dst[row * dst_stride + x]), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));\
        }\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    OPNAME ## _pixels4_l2(block, pixels, pixels + 1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    OPNAME ## _pixels4_l2(block, pixels, pixels + line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    OPNAME ## _pixels2_l2(block, pixels, pixels + 1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    OPNAME ## _pixels2_l2(block, pixels, pixels + line_size, line_size, line_size, line_size, h);\
}\
\
/* Four-source combination with a rounding term of 1 instead of 2. */\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h)\
{\
    int row, x;\
    for (row = 0; row < h; row++) {\
        for (x = 0; x < 8; x += 4) {\
            const uint32_t a = LD32(&src1[row * src_stride1 + x]);\
            const uint32_t b = LD32(&src2[row * src_stride2 + x]);\
            const uint32_t c = LD32(&src3[row * src_stride3 + x]);\
            const uint32_t d = LD32(&src4[row * src_stride4 + x]);\
            const uint32_t l0 = (a & 0x03030303UL)\
                              + (b & 0x03030303UL)\
                              + 0x01010101UL;\
            const uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2)\
                              + ((b & 0xFCFCFCFCUL) >> 2);\
            const uint32_t l1 = (c & 0x03030303UL)\
                              + (d & 0x03030303UL);\
            const uint32_t h1 = ((c & 0xFCFCFCFCUL) >> 2)\
                              + ((d & 0xFCFCFCFCUL) >> 2);\
            OP(*((uint32_t *)&dst[row * dst_stride + x]), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));\
        }\
    }\
}\
\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h)\
{\
    OPNAME ## _pixels8_l4(dst    , src1    , src2    , src3    , src4    , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst + 8, src1 + 8, src2 + 8, src3 + 8, src4 + 8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h)\
{\
    OPNAME ## _no_rnd_pixels8_l4(dst    , src1    , src2    , src3    , src4    , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst + 8, src1 + 8, src2 + 8, src3 + 8, src4 + 8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
/* 2 wide xy2, two rows per iteration; a0/b0 and a1/b1 hold the horizontal pair sums of alternating rows. */\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row, a0, b0, a1, b1;\
    a0  = pixels[0];\
    b0  = pixels[1] + 2;\
    a0 += b0;\
    b0 += pixels[2];\
\
    pixels += line_size;\
    for (row = 0; row < h; row += 2) {\
        a1  = pixels[0];\
        b1  = pixels[1];\
        a1 += b1;\
        b1 += pixels[2];\
\
        block[0] = (a1 + a0) >> 2; /* FIXME non put */\
        block[1] = (b1 + b0) >> 2;\
\
        pixels += line_size;\
        block  += line_size;\
\
        a0  = pixels[0];\
        b0  = pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        block[0] = (a1 + a0) >> 2;\
        block[1] = (b1 + b0) >> 2;\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
/* 4 wide xy2: 2x2 neighbourhood average with a rounding term of 2, two rows per iteration. */\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    const uint32_t top_a = LD32(pixels);\
    const uint32_t top_b = LD32(pixels + 1);\
    uint32_t l0 = (top_a & 0x03030303UL)\
                + (top_b & 0x03030303UL)\
                + 0x02020202UL;\
    uint32_t h0 = ((top_a & 0xFCFCFCFCUL) >> 2)\
                + ((top_b & 0xFCFCFCFCUL) >> 2);\
    uint32_t l1, h1;\
\
    pixels += line_size;\
    for (row = 0; row < h; row += 2) {\
        uint32_t a = LD32(pixels);\
        uint32_t b = LD32(pixels + 1);\
        l1 = (a & 0x03030303UL)\
           + (b & 0x03030303UL);\
        h1 = ((a & 0xFCFCFCFCUL) >> 2)\
           + ((b & 0xFCFCFCFCUL) >> 2);\
        OP(*((uint32_t *)block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));\
        pixels += line_size;\
        block  += line_size;\
        a = LD32(pixels);\
        b = LD32(pixels + 1);\
        l0 = (a & 0x03030303UL)\
           + (b & 0x03030303UL)\
           + 0x02020202UL;\
        h0 = ((a & 0xFCFCFCFCUL) >> 2)\
           + ((b & 0xFCFCFCFCUL) >> 2);\
        OP(*((uint32_t *)block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
/* 8 wide xy2: the 4 wide scheme applied to the left half, then to the right half. */\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int half;\
    for (half = 0; half < 2; half++) {\
        int row;\
        const uint32_t top_a = LD32(pixels);\
        const uint32_t top_b = LD32(pixels + 1);\
        uint32_t l0 = (top_a & 0x03030303UL)\
                    + (top_b & 0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0 = ((top_a & 0xFCFCFCFCUL) >> 2)\
                    + ((top_b & 0xFCFCFCFCUL) >> 2);\
        uint32_t l1, h1;\
\
        pixels += line_size;\
        for (row = 0; row < h; row += 2) {\
            uint32_t a = LD32(pixels);\
            uint32_t b = LD32(pixels + 1);\
            l1 = (a & 0x03030303UL)\
               + (b & 0x03030303UL);\
            h1 = ((a & 0xFCFCFCFCUL) >> 2)\
               + ((b & 0xFCFCFCFCUL) >> 2);\
            OP(*((uint32_t *)block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));\
            pixels += line_size;\
            block  += line_size;\
            a = LD32(pixels);\
            b = LD32(pixels + 1);\
            l0 = (a & 0x03030303UL)\
               + (b & 0x03030303UL)\
               + 0x02020202UL;\
            h0 = ((a & 0xFCFCFCFCUL) >> 2)\
               + ((b & 0xFCFCFCFCUL) >> 2);\
            OP(*((uint32_t *)block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));\
            pixels += line_size;\
            block  += line_size;\
        }\
        /* back to the top row, four columns further right */\
        pixels += 4 - line_size * (h + 1);\
        block  += 4 - line_size * h;\
    }\
}\
\
/* 8 wide xy2 with a rounding term of 1 instead of 2. */\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int half;\
    for (half = 0; half < 2; half++) {\
        int row;\
        const uint32_t top_a = LD32(pixels);\
        const uint32_t top_b = LD32(pixels + 1);\
        uint32_t l0 = (top_a & 0x03030303UL)\
                    + (top_b & 0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0 = ((top_a & 0xFCFCFCFCUL) >> 2)\
                    + ((top_b & 0xFCFCFCFCUL) >> 2);\
        uint32_t l1, h1;\
\
        pixels += line_size;\
        for (row = 0; row < h; row += 2) {\
            uint32_t a = LD32(pixels);\
            uint32_t b = LD32(pixels + 1);\
            l1 = (a & 0x03030303UL)\
               + (b & 0x03030303UL);\
            h1 = ((a & 0xFCFCFCFCUL) >> 2)\
               + ((b & 0xFCFCFCFCUL) >> 2);\
            OP(*((uint32_t *)block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));\
            pixels += line_size;\
            block  += line_size;\
            a = LD32(pixels);\
            b = LD32(pixels + 1);\
            l0 = (a & 0x03030303UL)\
               + (b & 0x03030303UL)\
               + 0x01010101UL;\
            h0 = ((a & 0xFCFCFCFCUL) >> 2)\
               + ((b & 0xFCFCFCFCUL) >> 2);\
            OP(*((uint32_t *)block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));\
            pixels += line_size;\
            block  += line_size;\
        }\
        /* back to the top row, four columns further right */\
        pixels += 4 - line_size * (h + 1);\
        block  += 4 - line_size * h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c    , OPNAME ## _pixels8_c           , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)

/* Averaging store: merge the new value into the destination with rnd_avg32(). */
#define op_avg(a, b) a = rnd_avg32(a, b)
59fe111e 887#endif
59fe111e
MN
888#define op_put(a, b) a = b
889
890PIXOP2(avg, op_avg)
891PIXOP2(put, op_put)
892#undef op_avg
893#undef op_put
894
de6d9b64
FB
/*
 * Rounded integer averages of two and four values.
 * Every argument is parenthesised so that operands containing operators of
 * lower precedence than '+' (for example x|y or m&1) are averaged as a whole.
 * Each argument is evaluated exactly once.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
897
c0a0170c
MN
/**
 * Single-stride wrapper around put_no_rnd_pixels16_l2(): the same stride is
 * passed for all three stride arguments of the underlying routine.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
901
/**
 * Single-stride wrapper around put_no_rnd_pixels8_l2(): the same stride is
 * passed for all three stride arguments of the underlying routine.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
073b013d 905
/**
 * Bilinear interpolation of an 8-pixel-wide block with 1/16 sample precision.
 *
 * @param dst     destination, 8 pixels written per row
 * @param src     source; 9 pixels are read from each of rows i and i+1
 * @param stride  distance in bytes between rows, shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal fraction in sixteenths
 * @param y16     vertical fraction in sixteenths
 * @param rounder constant added before the final shift by 8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* the four weights always sum to 256, hence the >>8 below */
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i, j;

    for(i=0; i<h; i++)
    {
        for(j=0; j<8; j++)
            dst[j]= (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
928
/**
 * Motion compensation of an 8-pixel-wide block with a per-pixel source
 * position that advances linearly.
 *
 * (ox, oy) is the source position of the first output pixel. Each step in x
 * adds (dxx, dyx) and each new output row adds (dxy, dyy). The low 16 bits of
 * a position are discarded; of what remains, the low `shift` bits are the
 * interpolation fraction and the rest is the integer sample coordinate.
 * r is added before the final shift by 2*shift. Coordinates outside the
 * width x height source area are clamped to its edge.
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* from here on width/height hold the last valid coordinate */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* the unsigned compare rejects negative coordinates as well */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* row clamped: interpolate horizontally only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* column clamped: interpolate vertically only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* both clamped: copy the nearest edge sample */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
669ac79c
MN
986
/**
 * Third-pel "put" at the integer position: dispatches to the plain copy
 * routine for the block width. Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
995
/**
 * Third-pel "put", horizontal only: src[j] weighted 2, src[j+1] weighted 1.
 * Multiplying by 683 and shifting by 11 is a fixed-point division by 3.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1006
/**
 * Third-pel "put", horizontal only: src[j] weighted 1, src[j+1] weighted 2.
 * Multiplying by 683 and shifting by 11 is a fixed-point division by 3.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1017
/**
 * Third-pel "put", vertical only: src[j] weighted 2, the sample one row
 * below weighted 1. Multiplying by 683 and shifting by 11 is a fixed-point
 * division by 3.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1028
/**
 * Third-pel "put", diagonal: the 2x2 neighbourhood is weighted 4,3 / 3,2
 * (top-left heaviest). The weights sum to 12; multiplying by 2731 and
 * shifting by 15 is a fixed-point division by 12.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1039
/**
 * Third-pel "put", diagonal: the 2x2 neighbourhood is weighted 3,2 / 4,3
 * (bottom-left heaviest). The weights sum to 12; multiplying by 2731 and
 * shifting by 15 is a fixed-point division by 12.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1050
/**
 * Third-pel "put", vertical only: src[j] weighted 1, the sample one row
 * below weighted 2. Multiplying by 683 and shifting by 11 is a fixed-point
 * division by 3.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1061
/**
 * Third-pel "put", diagonal: the 2x2 neighbourhood is weighted 3,4 / 2,3
 * (top-right heaviest). The weights sum to 12; multiplying by 2731 and
 * shifting by 15 is a fixed-point division by 12.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1072
/**
 * Third-pel "put", diagonal: the 2x2 neighbourhood is weighted 2,3 / 3,4
 * (bottom-right heaviest). The weights sum to 12; multiplying by 2731 and
 * shifting by 15 is a fixed-point division by 12.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
da3b9756
MM
1083
/**
 * Third-pel "avg" at the integer position: dispatches to the averaging
 * routine for the block width. Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
1092
/**
 * Third-pel "avg", horizontal only: interpolates with weights 2 (src[j]) and
 * 1 (src[j+1]), then stores the rounded-up mean of that value and the pixel
 * already in dst.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1103
/**
 * Third-pel "avg", horizontal only: interpolates with weights 1 (src[j]) and
 * 2 (src[j+1]), then stores the rounded-up mean of that value and the pixel
 * already in dst.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1114
/**
 * Third-pel "avg", vertical only: interpolates with weights 2 (src[j]) and
 * 1 (sample one row below), then stores the rounded-up mean of that value
 * and the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1125
/**
 * Third-pel "avg", diagonal: interpolates the 2x2 neighbourhood with weights
 * 4,3 / 3,2 (sum 12), then stores the rounded-up mean of that value and the
 * pixel already in dst.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1136
/**
 * Third-pel "avg", diagonal: interpolates the 2x2 neighbourhood with weights
 * 3,2 / 4,3 (sum 12), then stores the rounded-up mean of that value and the
 * pixel already in dst.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1147
/**
 * Third-pel "avg", vertical only: interpolates with weights 1 (src[j]) and
 * 2 (sample one row below), then stores the rounded-up mean of that value
 * and the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1158
/**
 * Third-pel "avg", diagonal: interpolates the 2x2 neighbourhood with weights
 * 3,4 / 2,3 (sum 12), then stores the rounded-up mean of that value and the
 * pixel already in dst.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1169
/**
 * Third-pel "avg", diagonal: interpolates the 2x2 neighbourhood with weights
 * 2,3 / 3,4 (sum 12), then stores the rounded-up mean of that value and the
 * pixel already in dst.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
669ac79c
MN
#if 0
/*
 * Disabled. TPEL_WIDTH(width) would emit fixed-width wrappers around the
 * generic put_tpel_pixels_mcXY_c() routines. Each wrapper body is a plain
 * call; a leading "void" would turn it into a malformed declaration.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1201
0da71265
MN
/*
 * Generates OPNAME h264_chroma_mc{2,4,8}_c: bilinear interpolation of a
 * 2-, 4- or 8-pixel-wide block over h rows with 1/8 sample fractions x and y
 * (both asserted to lie in 0..7). The weights A..D always sum to 64. OP(a, b)
 * receives the unnormalized weighted sum in b and is responsible for scaling
 * it and storing the result into a.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1264
1265#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1266#define op_put(a, b) a = (((b) + 32)>>6)
1267
1268H264_CHROMA_MC(put_ , op_put)
1269H264_CHROMA_MC(avg_ , op_avg)
1270#undef op_avg
1271#undef op_put
1272
/**
 * Copies h rows of 4 bytes from src to dst, one LD32/ST32 pair per row.
 * dst and src advance by their own strides.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1283
/**
 * Copies h rows of 8 bytes from src to dst, two LD32/ST32 pairs per row.
 * dst and src advance by their own strides.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1295
/**
 * Copies h rows of 16 bytes from src to dst, four LD32/ST32 pairs per row.
 * dst and src advance by their own strides.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst+=dstStride;
        src+=srcStride;
    }
}
073b013d 1309
/**
 * Copies h rows of 17 bytes from src to dst: four LD32/ST32 pairs plus one
 * trailing byte per row. dst and src advance by their own strides.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16];
        dst+=dstStride;
        src+=srcStride;
    }
}
1324
/**
 * Copies h rows of 9 bytes from src to dst: two LD32/ST32 pairs plus one
 * trailing byte per row. dst and src advance by their own strides.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst[8]= src[8];
        dst+=dstStride;
        src+=srcStride;
    }
}
1337
826f429a 1338
b3184779 1339#define QPEL_MC(r, OPNAME, RND, OP) \
0c1a9eda
ZK
1340static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1341 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779
MN
1342 int i;\
1343 for(i=0; i<h; i++)\
1344 {\
1345 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1346 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1347 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1348 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1349 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1350 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1351 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1352 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1353 dst+=dstStride;\
1354 src+=srcStride;\
1355 }\
44eb4951
MN
1356}\
1357\
0c1a9eda 1358static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
db794953 1359 const int w=8;\
0c1a9eda 1360 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779
MN
1361 int i;\
1362 for(i=0; i<w; i++)\
1363 {\
1364 const int src0= src[0*srcStride];\
1365 const int src1= src[1*srcStride];\
1366 const int src2= src[2*srcStride];\
1367 const int src3= src[3*srcStride];\
1368 const int src4= src[4*srcStride];\
1369 const int src5= src[5*srcStride];\
1370 const int src6= src[6*srcStride];\
1371 const int src7= src[7*srcStride];\
1372 const int src8= src[8*srcStride];\
1373 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1374 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1375 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1376 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1377 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1378 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1379 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1380 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1381 dst++;\
1382 src++;\
1383 }\
1384}\
1385\
0c1a9eda
ZK
1386static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1387 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779 1388 int i;\
826f429a 1389 \
b3184779
MN
1390 for(i=0; i<h; i++)\
1391 {\
1392 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1393 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1394 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1395 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1396 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1397 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1398 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1399 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1400 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1401 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1402 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1403 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1404 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1405 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1406 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1407 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1408 dst+=dstStride;\
1409 src+=srcStride;\
1410 }\
1411}\
1412\
0c1a9eda
ZK
1413static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1414 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779 1415 int i;\
826f429a 1416 const int w=16;\
b3184779
MN
1417 for(i=0; i<w; i++)\
1418 {\
1419 const int src0= src[0*srcStride];\
1420 const int src1= src[1*srcStride];\
1421 const int src2= src[2*srcStride];\
1422 const int src3= src[3*srcStride];\
1423 const int src4= src[4*srcStride];\
1424 const int src5= src[5*srcStride];\
1425 const int src6= src[6*srcStride];\
1426 const int src7= src[7*srcStride];\
1427 const int src8= src[8*srcStride];\
1428 const int src9= src[9*srcStride];\
1429 const int src10= src[10*srcStride];\
1430 const int src11= src[11*srcStride];\
1431 const int src12= src[12*srcStride];\
1432 const int src13= src[13*srcStride];\
1433 const int src14= src[14*srcStride];\
1434 const int src15= src[15*srcStride];\
1435 const int src16= src[16*srcStride];\
1436 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1437 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1438 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1439 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1440 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1441 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1442 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1443 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1444 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1445 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1446 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1447 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1448 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1449 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1450 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1451 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1452 dst++;\
1453 src++;\
1454 }\
1455}\
1456\
0c1a9eda 1457static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
45553457 1458 OPNAME ## pixels8_c(dst, src, stride, 8);\
b3184779
MN
1459}\
1460\
0c1a9eda
ZK
1461static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1462 uint8_t half[64];\
b3184779
MN
1463 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1464 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
44eb4951
MN
1465}\
1466\
0c1a9eda 1467static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
b3184779 1468 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
44eb4951
MN
1469}\
1470\
0c1a9eda
ZK
1471static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1472 uint8_t half[64];\
b3184779
MN
1473 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1474 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
44eb4951
MN
1475}\
1476\
0c1a9eda
ZK
1477static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1478 uint8_t full[16*9];\
1479 uint8_t half[64];\
b3184779 1480 copy_block9(full, src, 16, stride, 9);\
db794953 1481 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 1482 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
44eb4951
MN
1483}\
1484\
0c1a9eda
ZK
1485static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1486 uint8_t full[16*9];\
b3184779 1487 copy_block9(full, src, 16, stride, 9);\
db794953 1488 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
44eb4951
MN
1489}\
1490\
0c1a9eda
ZK
1491static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1492 uint8_t full[16*9];\
1493 uint8_t half[64];\
b3184779 1494 copy_block9(full, src, 16, stride, 9);\
db794953 1495 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 1496 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
44eb4951 1497}\
0c1a9eda
ZK
1498void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1499 uint8_t full[16*9];\
1500 uint8_t halfH[72];\
1501 uint8_t halfV[64];\
1502 uint8_t halfHV[64];\
b3184779
MN
1503 copy_block9(full, src, 16, stride, 9);\
1504 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1505 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1506 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1507 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1508}\
0c1a9eda
ZK
1509static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1510 uint8_t full[16*9];\
1511 uint8_t halfH[72];\
1512 uint8_t halfHV[64];\
db794953
MN
1513 copy_block9(full, src, 16, stride, 9);\
1514 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1515 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1516 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1517 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1518}\
0c1a9eda
ZK
1519void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1520 uint8_t full[16*9];\
1521 uint8_t halfH[72];\
1522 uint8_t halfV[64];\
1523 uint8_t halfHV[64];\
b3184779
MN
1524 copy_block9(full, src, 16, stride, 9);\
1525 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1526 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1527 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1528 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1529}\
0c1a9eda
ZK
1530static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1531 uint8_t full[16*9];\
1532 uint8_t halfH[72];\
1533 uint8_t halfHV[64];\
db794953
MN
1534 copy_block9(full, src, 16, stride, 9);\
1535 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1536 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1537 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1538 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1539}\
0c1a9eda
ZK
1540void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1541 uint8_t full[16*9];\
1542 uint8_t halfH[72];\
1543 uint8_t halfV[64];\
1544 uint8_t halfHV[64];\
b3184779
MN
1545 copy_block9(full, src, 16, stride, 9);\
1546 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1547 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1548 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1549 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1550}\
0c1a9eda
ZK
1551static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1552 uint8_t full[16*9];\
1553 uint8_t halfH[72];\
1554 uint8_t halfHV[64];\
db794953
MN
1555 copy_block9(full, src, 16, stride, 9);\
1556 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1557 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1558 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1559 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1560}\
0c1a9eda
ZK
1561void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1562 uint8_t full[16*9];\
1563 uint8_t halfH[72];\
1564 uint8_t halfV[64];\
1565 uint8_t halfHV[64];\
b3184779
MN
1566 copy_block9(full, src, 16, stride, 9);\
1567 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
db794953
MN
1568 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1569 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1570 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1571}\
0c1a9eda
ZK
1572static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1573 uint8_t full[16*9];\
1574 uint8_t halfH[72];\
1575 uint8_t halfHV[64];\
db794953
MN
1576 copy_block9(full, src, 16, stride, 9);\
1577 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1578 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1579 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1580 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1581}\
0c1a9eda
ZK
1582static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1583 uint8_t halfH[72];\
1584 uint8_t halfHV[64];\
b3184779 1585 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1586 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1587 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
44eb4951 1588}\
0c1a9eda
ZK
1589static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1590 uint8_t halfH[72];\
1591 uint8_t halfHV[64];\
b3184779 1592 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1593 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1594 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
44eb4951 1595}\
0c1a9eda
ZK
1596void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1597 uint8_t full[16*9];\
1598 uint8_t halfH[72];\
1599 uint8_t halfV[64];\
1600 uint8_t halfHV[64];\
b3184779
MN
1601 copy_block9(full, src, 16, stride, 9);\
1602 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1603 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1604 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1605 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1606}\
0c1a9eda
ZK
1607static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1608 uint8_t full[16*9];\
1609 uint8_t halfH[72];\
db794953
MN
1610 copy_block9(full, src, 16, stride, 9);\
1611 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1612 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1613 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1614}\
0c1a9eda
ZK
1615void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1616 uint8_t full[16*9];\
1617 uint8_t halfH[72];\
1618 uint8_t halfV[64];\
1619 uint8_t halfHV[64];\
b3184779
MN
1620 copy_block9(full, src, 16, stride, 9);\
1621 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1622 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1623 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1624 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1625}\
0c1a9eda
ZK
1626static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1627 uint8_t full[16*9];\
1628 uint8_t halfH[72];\
db794953
MN
1629 copy_block9(full, src, 16, stride, 9);\
1630 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1631 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1632 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1633}\
0c1a9eda
ZK
1634static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1635 uint8_t halfH[72];\
b3184779 1636 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1637 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
b3184779 1638}\
0c1a9eda 1639static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
45553457 1640 OPNAME ## pixels16_c(dst, src, stride, 16);\
b3184779
MN
1641}\
1642\
0c1a9eda
ZK
1643static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1644 uint8_t half[256];\
b3184779
MN
1645 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1646 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1647}\
1648\
0c1a9eda 1649static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
b3184779 1650 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
44eb4951 1651}\
b3184779 1652\
0c1a9eda
ZK
1653static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1654 uint8_t half[256];\
b3184779
MN
1655 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1656 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1657}\
1658\
0c1a9eda
ZK
1659static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1660 uint8_t full[24*17];\
1661 uint8_t half[256];\
b3184779 1662 copy_block17(full, src, 24, stride, 17);\
826f429a 1663 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1664 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1665}\
1666\
0c1a9eda
ZK
1667static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1668 uint8_t full[24*17];\
b3184779 1669 copy_block17(full, src, 24, stride, 17);\
826f429a 1670 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
b3184779
MN
1671}\
1672\
0c1a9eda
ZK
1673static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1674 uint8_t full[24*17];\
1675 uint8_t half[256];\
b3184779 1676 copy_block17(full, src, 24, stride, 17);\
826f429a 1677 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1678 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1679}\
0c1a9eda
ZK
1680void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1681 uint8_t full[24*17];\
1682 uint8_t halfH[272];\
1683 uint8_t halfV[256];\
1684 uint8_t halfHV[256];\
b3184779
MN
1685 copy_block17(full, src, 24, stride, 17);\
1686 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1687 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1688 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1689 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1690}\
0c1a9eda
ZK
1691static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1692 uint8_t full[24*17];\
1693 uint8_t halfH[272];\
1694 uint8_t halfHV[256];\
db794953
MN
1695 copy_block17(full, src, 24, stride, 17);\
1696 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1697 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1698 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1699 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1700}\
0c1a9eda
ZK
1701void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1702 uint8_t full[24*17];\
1703 uint8_t halfH[272];\
1704 uint8_t halfV[256];\
1705 uint8_t halfHV[256];\
b3184779
MN
1706 copy_block17(full, src, 24, stride, 17);\
1707 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1708 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1709 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1710 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1711}\
0c1a9eda
ZK
1712static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1713 uint8_t full[24*17];\
1714 uint8_t halfH[272];\
1715 uint8_t halfHV[256];\
db794953
MN
1716 copy_block17(full, src, 24, stride, 17);\
1717 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1718 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1719 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1720 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1721}\
0c1a9eda
ZK
1722void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1723 uint8_t full[24*17];\
1724 uint8_t halfH[272];\
1725 uint8_t halfV[256];\
1726 uint8_t halfHV[256];\
b3184779
MN
1727 copy_block17(full, src, 24, stride, 17);\
1728 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1729 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1730 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1731 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1732}\
0c1a9eda
ZK
1733static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1734 uint8_t full[24*17];\
1735 uint8_t halfH[272];\
1736 uint8_t halfHV[256];\
db794953
MN
1737 copy_block17(full, src, 24, stride, 17);\
1738 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1739 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1740 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1741 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1742}\
0c1a9eda
ZK
1743void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1744 uint8_t full[24*17];\
1745 uint8_t halfH[272];\
1746 uint8_t halfV[256];\
1747 uint8_t halfHV[256];\
b3184779
MN
1748 copy_block17(full, src, 24, stride, 17);\
1749 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
826f429a
MN
1750 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1751 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1752 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1753}\
0c1a9eda
ZK
1754static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1755 uint8_t full[24*17];\
1756 uint8_t halfH[272];\
1757 uint8_t halfHV[256];\
db794953
MN
1758 copy_block17(full, src, 24, stride, 17);\
1759 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1760 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1761 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1762 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1763}\
0c1a9eda
ZK
1764static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1765 uint8_t halfH[272];\
1766 uint8_t halfHV[256];\
b3184779 1767 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1768 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1769 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1770}\
0c1a9eda
ZK
1771static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1772 uint8_t halfH[272];\
1773 uint8_t halfHV[256];\
b3184779 1774 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1775 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1776 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1777}\
0c1a9eda
ZK
1778void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1779 uint8_t full[24*17];\
1780 uint8_t halfH[272];\
1781 uint8_t halfV[256];\
1782 uint8_t halfHV[256];\
b3184779
MN
1783 copy_block17(full, src, 24, stride, 17);\
1784 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1785 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1786 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1787 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1788}\
0c1a9eda
ZK
1789static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1790 uint8_t full[24*17];\
1791 uint8_t halfH[272];\
db794953
MN
1792 copy_block17(full, src, 24, stride, 17);\
1793 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1794 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1795 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1796}\
0c1a9eda
ZK
1797void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1798 uint8_t full[24*17];\
1799 uint8_t halfH[272];\
1800 uint8_t halfV[256];\
1801 uint8_t halfHV[256];\
b3184779
MN
1802 copy_block17(full, src, 24, stride, 17);\
1803 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1804 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1805 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1806 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1807}\
0c1a9eda
ZK
1808static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1809 uint8_t full[24*17];\
1810 uint8_t halfH[272];\
db794953
MN
1811 copy_block17(full, src, 24, stride, 17);\
1812 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1813 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1814 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1815}\
0c1a9eda
ZK
1816static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1817 uint8_t halfH[272];\
b3184779 1818 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1819 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
45553457 1820}
44eb4951 1821
b3184779
MN
1822#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1823#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1824#define op_put(a, b) a = cm[((b) + 16)>>5]
1825#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1826
1827QPEL_MC(0, put_ , _ , op_put)
1828QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1829QPEL_MC(0, avg_ , _ , op_avg)
1830//QPEL_MC(1, avg_no_rnd , _ , op_avg)
1831#undef op_avg
1832#undef op_avg_no_rnd
1833#undef op_put
1834#undef op_put_no_rnd
44eb4951 1835
0da71265
MN
1836#if 1
/*
 * H264_LOWPASS_SIZE(OPNAME, OP, OP2, N) -- helper used only by H264_LOWPASS.
 *
 * Emits the three NxN half-sample filters for one block size N (4 or 8):
 *   OPNAME##h264_qpel<N>_h_lowpass   horizontal pass
 *   OPNAME##h264_qpel<N>_v_lowpass   vertical pass
 *   OPNAME##h264_qpel<N>_hv_lowpass  horizontal pass into tmp, then vertical
 *
 * Every pass applies the 6-tap kernel (1, -5, 20, 20, -5, 1).  The raw sum is
 * handed to OP (single pass) or OP2 (two passes), which normalise and store
 * it; both operators use the local `cm` table pointer.
 *
 * The filters read 2 samples before and 3 samples after the block along the
 * filtered direction, so callers must provide that margin.  The hv variant
 * needs N+5 rows of N int16_t in tmp (row pitch tmpStride).
 */
#define H264_LOWPASS_SIZE(OPNAME, OP, OP2, N) \
static void OPNAME ## h264_qpel ## N ## _h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int x, y;\
    for(y=0; y<(N); y++){\
        for(x=0; x<(N); x++){\
            OP(dst[x], (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]));\
        }\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel ## N ## _v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int x, y;\
    for(x=0; x<(N); x++){\
        int col[(N)+5];\
        /* fetch rows -2 .. N+2 of this column before storing anything */\
        for(y=0; y<(N)+5; y++)\
            col[y]= src[(y-2)*srcStride];\
        for(y=0; y<(N); y++){\
            OP(dst[y*dstStride], (col[y+2]+col[y+3])*20 - (col[y+1]+col[y+4])*5 + (col[y]+col[y+5]));\
        }\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel ## N ## _hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int x, y;\
    /* first pass: unnormalised horizontal sums for rows -2 .. N+2 */\
    src -= 2*srcStride;\
    for(y=0; y<(N)+5; y++){\
        for(x=0; x<(N); x++)\
            tmp[x]= (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind tmp to the row that corresponds to output row 0 */\
    tmp -= tmpStride*((N)+5-2);\
    /* second pass: vertical filter over the intermediate sums */\
    for(x=0; x<(N); x++){\
        int col[(N)+5];\
        for(y=0; y<(N)+5; y++)\
            col[y]= tmp[(y-2)*tmpStride];\
        for(y=0; y<(N); y++){\
            OP2(dst[y*dstStride], (col[y+2]+col[y+3])*20 - (col[y+1]+col[y+4])*5 + (col[y]+col[y+5]));\
        }\
        dst++;\
        tmp++;\
    }\
}

/*
 * H264_LOWPASS(OPNAME, OP, OP2)
 *
 * Generates the h/v/hv low-pass filters for 4x4, 8x8 and 16x16 blocks with
 * the name prefix OPNAME.  The 4x4 and 8x8 versions come from
 * H264_LOWPASS_SIZE; each 16x16 version is built from four calls to the
 * corresponding 8x8 filter, one per quadrant.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
H264_LOWPASS_SIZE(OPNAME, OP, OP2, 4)\
\
H264_LOWPASS_SIZE(OPNAME, OP, OP2, 8)\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}
/*
 * H264_MC(OPNAME, SIZE)
 *
 * Generates the sixteen SIZExSIZE quarter-sample motion-compensation functions
 *     OPNAME##h264_qpel<SIZE>_mcXY_c(dst, src, stride)
 * where X is the horizontal and Y the vertical quarter-sample offset (0..3).
 * dst and src share the same row pitch `stride`.
 *
 * Building blocks (all defined elsewhere in this file):
 *   put_h264_qpel<SIZE>_{h,v,hv}_lowpass  half-sample filters
 *   copy_block<SIZE>                      copies SIZE+5 source rows into a
 *                                         buffer with row pitch SIZE
 *   OPNAME##pixels<SIZE>_l2               combines two predictions into dst
 *
 * In the functions that filter vertically, `full` holds rows -2 .. SIZE+2 of
 * the source and `full_mid` points at its row 0.
 */
#define H264_MC(OPNAME, SIZE) \
/* (0,0): plain block copy / average */\
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* (1,0): source combined with the horizontal half-sample plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
/* (2,0): horizontal half-sample plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
/* (3,0): source shifted one pixel right combined with the horizontal half-sample plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
/* (0,1): source combined with the vertical half-sample plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
/* (0,2): vertical half-sample plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
/* (0,3): source shifted one row down combined with the vertical half-sample plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
/* (1,1): h half-sample plane of row 0 combined with v half-sample plane of column 0 */\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
/* (3,1): as (1,1) but the vertical plane is taken one pixel to the right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
/* (1,3): as (1,1) but the horizontal plane is taken one row down */\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
/* (3,3): horizontal plane one row down, vertical plane one pixel right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
/* (2,2): centre half-sample plane (horizontal then vertical filter) */\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
/* (2,1): horizontal half-sample plane combined with the centre plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
/* (2,3): horizontal half-sample plane of the next row combined with the centre plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
/* (1,2): vertical half-sample plane combined with the centre plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
/* (3,2): vertical half-sample plane of the next column combined with the centre plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}
2177#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2178//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2179#define op_put(a, b) a = cm[((b) + 16)>>5]
2180#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2181#define op2_put(a, b) a = cm[((b) + 512)>>10]
2182
2183H264_LOWPASS(put_ , op_put, op2_put)
2184H264_LOWPASS(avg_ , op_avg, op2_avg)
2185H264_MC(put_, 4)
2186H264_MC(put_, 8)
2187H264_MC(put_, 16)
2188H264_MC(avg_, 4)
2189H264_MC(avg_, 8)
2190H264_MC(avg_, 16)
2191
2192#undef op_avg
2193#undef op_put
2194#undef op2_avg
2195#undef op2_put
2196#endif
2197
1457ab52
MN
2198static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2199 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2200 int i;
2201
2202 for(i=0; i<h; i++){
2203 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2204 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2205 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2206 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2207 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2208 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2209 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2210 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2211 dst+=dstStride;
2212 src+=srcStride;
2213 }
2214}
2215
2216static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2217 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2218 int i;
2219
2220 for(i=0; i<w; i++){
2221 const int src_1= src[ -srcStride];
2222 const int src0 = src[0 ];
2223 const int src1 = src[ srcStride];
2224 const int src2 = src[2*srcStride];
2225 const int src3 = src[3*srcStride];
2226 const int src4 = src[4*srcStride];
2227 const int src5 = src[5*srcStride];
2228 const int src6 = src[6*srcStride];
2229 const int src7 = src[7*srcStride];
2230 const int src8 = src[8*srcStride];
2231 const int src9 = src[9*srcStride];
2232 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2233 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2234 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2235 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2236 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2237 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2238 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2239 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2240 src++;
2241 dst++;
2242 }
2243}
2244
/** 8x8 mspel prediction at offset (0,0): plain copy of the source block. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2248
/**
 * 8x8 mspel prediction at horizontal offset 1: the source block combined
 * (via put_pixels8_l2) with its horizontally filtered half-sample plane.
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2254
/** 8x8 mspel prediction at horizontal offset 2: the horizontal half-sample plane. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2258
/**
 * 8x8 mspel prediction at horizontal offset 3: the source block shifted one
 * pixel to the right combined (via put_pixels8_l2) with the horizontal
 * half-sample plane.
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2264
/** 8x8 mspel prediction at vertical offset 2: the vertical half-sample plane. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2268
/**
 * 8x8 mspel prediction at offset (1,2): the vertical half-sample plane
 * combined (via put_pixels8_l2) with the centre (horizontal+vertical) plane.
 *
 * halfH holds 11 horizontally filtered rows starting one row above the
 * block, because the vertical filter reads one row above and two below;
 * halfH+8 is therefore the row that lines up with the block's first row.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/**
 * 8x8 mspel prediction at offset (3,2): the vertical half-sample plane of the
 * block shifted one pixel to the right, combined (via put_pixels8_l2) with
 * the centre (horizontal+vertical) plane.
 *
 * halfH holds 11 horizontally filtered rows starting one row above the
 * block; halfH+8 is the row that lines up with the block's first row.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/**
 * 8x8 mspel prediction at offset (2,2): horizontal filter into a temporary
 * (11 rows, starting one row above the block), then vertical filter of that
 * temporary straight into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2292
332f9ac4
MN
2293static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2294 int x;
2295 const int strength= ff_h263_loop_filter_strength[qscale];
2296
2297 for(x=0; x<8; x++){
2298 int d1, d2, ad1;
2299 int p0= src[x-2*stride];
2300 int p1= src[x-1*stride];
2301 int p2= src[x+0*stride];
2302 int p3= src[x+1*stride];
2303 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2304
2305 if (d<-2*strength) d1= 0;
2306 else if(d<- strength) d1=-2*strength - d;
2307 else if(d< strength) d1= d;
2308 else if(d< 2*strength) d1= 2*strength - d;
2309 else d1= 0;
2310
2311 p1 += d1;
2312 p2 -= d1;
2313 if(p1&256) p1= ~(p1>>31);
2314 if(p2&256) p2= ~(p2>>31);
2315
2316 src[x-1*stride] = p1;
2317 src[x+0*stride] = p2;
2318
5b5404e3 2319 ad1= ABS(d1)>>1;
332f9ac4
MN
2320
2321 d2= clip((p0-p3)/4, -ad1, ad1);
2322
2323 src[x-2*stride] = p0 - d2;
2324 src[x+ stride] = p3 + d2;
2325 }
2326}
2327
2328static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2329 int y;
2330 const int strength= ff_h263_loop_filter_strength[qscale];
2331
2332 for(y=0; y<8; y++){
2333 int d1, d2, ad1;
2334 int p0= src[y*stride-2];
2335 int p1= src[y*stride-1];
2336 int p2= src[y*stride+0];
2337 int p3= src[y*stride+1];
2338 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2339
2340 if (d<-2*strength) d1= 0;
2341 else if(d<- strength) d1=-2*strength - d;
2342 else if(d< strength) d1= d;
2343 else if(d< 2*strength) d1= 2*strength - d;
2344 else d1= 0;
2345
2346 p1 += d1;
2347 p2 -= d1;
2348 if(p1&256) p1= ~(p1>>31);
2349 if(p2&256) p2= ~(p2>>31);
2350
2351 src[y*stride-1] = p1;
2352 src[y*stride+0] = p2;
2353
2354 ad1= ABS(d1)>>1;
2355
2356 d2= clip((p0-p3)/4, -ad1, ad1);
2357
2358 src[y*stride-2] = p0 - d2;
2359 src[y*stride+1] = p3 + d2;
2360 }
2361}
1457ab52 2362
/**
 * In-place separable (1,2,1) smoothing filter on one 8x8 block.
 *
 * Pass 1 filters vertically into temp[] with weights 1-2-1 (sum 4); the top
 * and bottom rows are not filtered and are stored scaled by 4 so that all of
 * temp[] has the same gain.  Pass 2 filters temp[] horizontally with weights
 * 1-2-1 and writes back with rounding; the left and right columns are not
 * filtered horizontally and are only rescaled.
 *
 * @param src    top-left pixel of the 8x8 block, modified in place
 * @param stride row pitch of the picture
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    /* rows 0 and 7: no vertical neighbours inside the block, keep value (x4) */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    /* rows 1..6: vertical 1-2-1 */
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        /* columns 0 and 7: undo the vertical gain of 4 only */
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        /* columns 1..6: horizontal 1-2-1, total gain 16 */
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
2389
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between two consecutive rows (both blocks)
 * @param h         number of rows
 * @return the accumulated sum over 16*h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2417
/**
 * Sum of absolute differences between a 16-pixel-wide block and a
 * reference in which every pixel is combined with its right neighbour
 * through avg2(). 17 pixels are read from each row of pix2.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference
 * @param line_size distance between two consecutive rows (both blocks)
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2445
/**
 * Sum of absolute differences between a 16-pixel-wide block and a
 * reference in which every pixel is combined with the pixel below it
 * through avg2(). h+1 rows are read from pix2.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference
 * @param line_size distance between two consecutive rows (both blocks)
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + line_size]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2475
/**
 * Sum of absolute differences between a 16-pixel-wide block and a
 * reference in which every pixel is combined with its right, lower and
 * lower-right neighbours through avg4(). 17 pixels of h+1 rows are read
 * from pix2.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference
 * @param line_size distance between two consecutive rows (both blocks)
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1],
                                        pix2[col + line_size], pix2[col+1 + line_size]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2505
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between two consecutive rows (both blocks)
 * @param h         number of rows
 * @return the accumulated sum over 8*h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2525
/**
 * Sum of absolute differences between an 8-pixel-wide block and a
 * reference in which every pixel is combined with its right neighbour
 * through avg2(). 9 pixels are read from each row of pix2.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference
 * @param line_size distance between two consecutive rows (both blocks)
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2545
/**
 * Sum of absolute differences between an 8-pixel-wide block and a
 * reference in which every pixel is combined with the pixel below it
 * through avg2(). h+1 rows are read from pix2.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference
 * @param line_size distance between two consecutive rows (both blocks)
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + line_size]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2567
/**
 * Sum of absolute differences between an 8-pixel-wide block and a
 * reference in which every pixel is combined with its right, lower and
 * lower-right neighbours through avg4(). 9 pixels of h+1 rows are read
 * from pix2.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference
 * @param line_size distance between two consecutive rows (both blocks)
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1],
                                        pix2[col + line_size], pix2[col+1 + line_size]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2589
d4c5d2ad 2590static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
e6a2ac34
MN
2591 int score1=0;
2592 int score2=0;
2593 int x,y;
d4c5d2ad 2594
e6a2ac34
MN
2595 for(y=0; y<h; y++){
2596 for(x=0; x<16; x++){
2597 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2598 }
2599 if(y+1<h){
2600 for(x=0; x<15; x++){
2601 score2+= ABS( s1[x ] - s1[x +stride]
2602 - s1[x+1] + s1[x+1+stride])
2603 -ABS( s2[x ] - s2[x +stride]
2604 - s2[x+1] + s2[x+1+stride]);
2605 }
2606 }
2607 s1+= stride;
2608 s2+= stride;
2609 }
d4c5d2ad
MN
2610
2611 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2612 else return score1 + ABS(score2)*8;
e6a2ac34
MN
2613}
2614
d4c5d2ad 2615static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
e6a2ac34
MN
2616 int score1=0;
2617 int score2=0;
2618 int x,y;
2619
2620 for(y=0; y<h; y++){
2621 for(x=0; x<8; x++){
2622 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2623 }
2624 if(y+1<h){
2625 for(x=0; x<7; x++){
2626 score2+= ABS( s1[x ] - s1[x +stride]
2627 - s1[x+1] + s1[x+1+stride])
2628 -ABS( s2[x ] - s2[x +stride]
2629 - s2[x+1] + s2[x+1+stride]);
2630 }
2631 }
2632 s1+= stride;
2633 s2+= stride;
2634 }
2635
d4c5d2ad
MN
2636 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2637 else return score1 + ABS(score2)*8;
e6a2ac34
MN
2638}
2639
364a1797
MN
2640static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2641 int i;
2642 unsigned int sum=0;
2643
2644 for(i=0; i<8*8; i++){
2645 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2646 int w= weight[i];
2647 b>>= RECON_SHIFT;
2648 assert(-512<b && b<512);
2649
2650 sum += (w*b)*(w*b)>>4;
2651 }
2652 return sum>>2;
2653}
2654
2655static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2656 int i;
2657
2658 for(i=0; i<8*8; i++){
2659 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2660 }
2661}
2662
a9badb51
MN
2663/**
2664 * permutes an 8x8 block.
2a5700de 2665 * @param block the block which will be permuted according to the given permutation vector
a9badb51
MN
2666 * @param permutation the permutation vector
2667 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2a5700de
MN
2668 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2669 * (inverse) permutated to scantable order!
a9badb51 2670 */
0c1a9eda 2671void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
d962f6fd 2672{
7801d21d 2673 int i;
477ab036 2674 DCTELEM temp[64];
7801d21d
MN
2675
2676 if(last<=0) return;
9a7b310d 2677 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
d962f6fd 2678
7801d21d
MN
2679 for(i=0; i<=last; i++){
2680 const int j= scantable[i];
2681 temp[j]= block[j];
2682 block[j]=0;
2683 }
2684
2685 for(i=0; i<=last; i++){
2686 const int j= scantable[i];
2687 const int perm_j= permutation[j];
2688 block[perm_j]= temp[j];
2689 }
d962f6fd 2690}
e0eac44e 2691
622348f9
MN
/**
 * Comparison function that ignores all of its arguments and always
 * returns 0; ff_set_cmp() installs it for FF_CMP_ZERO.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
2695
2696void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
2697 int i;
2698
2699 memset(cmp, 0, sizeof(void*)*5);
2700
2701 for(i=0; i<5; i++){
2702 switch(type&0xFF){
2703 case FF_CMP_SAD:
2704 cmp[i]= c->sad[i];
2705 break;
2706 case FF_CMP_SATD:
2707 cmp[i]= c->hadamard8_diff[i];
2708 break;
2709 case FF_CMP_SSE:
2710 cmp[i]= c->sse[i];
2711 break;
2712 case FF_CMP_DCT:
2713 cmp[i]= c->dct_sad[i];
2714 break;
2715 case FF_CMP_PSNR:
2716 cmp[i]= c->quant_psnr[i];
2717 break;
2718 case FF_CMP_BIT:
2719 cmp[i]= c->bit[i];
2720 break;
2721 case FF_CMP_RD:
2722 cmp[i]= c->rd[i];
2723 break;
2724 case FF_CMP_VSAD:
2725 cmp[i]= c->vsad[i];
2726 break;
2727 case FF_CMP_VSSE:
2728 cmp[i]= c->vsse[i];
2729 break;
2730 case FF_CMP_ZERO:
2731 cmp[i]= zero_cmp;
2732 break;
e6a2ac34
MN
2733 case FF_CMP_NSSE:
2734 cmp[i]= c->nsse[i];
2735 break;
622348f9
MN
2736 default:
2737 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
2738 }
2739 }
2740}
2741
2a5700de
MN
/**
 * Zeroes 6*64 consecutive DCTELEMs starting at blocks, i.e.
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
2749
11f18faf
MN
/**
 * Adds src to dst byte by byte; each sum wraps around modulo 256.
 * @param dst bytes that are updated in place
 * @param src bytes that are added
 * @param w   number of bytes to process
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;

    for(i=0; i<w; i++)
        dst[i] += src[i];
}
2765
/**
 * Stores src1 - src2 byte by byte into dst; each difference wraps
 * around modulo 256.
 * @param dst  receives the differences
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes to process
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;

    for(i=0; i<w; i++)
        dst[i] = src1[i] - src2[i];
}
2781
84705403
MN
/**
 * Writes to dst the difference between each byte of src2 and a
 * prediction formed by mid_pred() from the left byte, the byte above
 * (src1) and left + above - above-left (kept to 8 bits).
 * @param dst      receives the residuals
 * @param src1     the line above
 * @param src2     the line being predicted
 * @param w        number of bytes to process
 * @param left     in: byte left of src2[0]; out: last byte of src2 read
 * @param left_top in: byte left of src1[0]; out: last byte of src1 read
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left= *left;
    uint8_t cur_left_top= *left_top;
    int i;

    for(i=0; i<w; i++){
        const int top= src1[i];
        const int gradient= (cur_left + top - cur_left_top)&0xFF;
        const int pred= mid_pred(cur_left, top, gradient);

        cur_left_top= top;
        cur_left= src2[i];
        dst[i]= cur_left - pred;
    }

    *left= cur_left;
    *left_top= cur_left_top;
}
2799
1457ab52
MN
/*
 * Butterfly helpers for the Hadamard transforms below.
 * The statement macros are wrapped in do{}while(0) so that each use,
 * followed by its semicolon, is exactly one statement (safe in an
 * unbraced if/else). Their temporaries carry a prefix so that operands
 * that happen to be called a or b are not captured.
 */

/* o1 = i1+i2, o2 = i1-i2; i1 and i2 are evaluated once */
#define BUTTERFLY2(o1,o2,i1,i2) \
do{\
    const int butterfly_i1= (i1);\
    const int butterfly_i2= (i2);\
    (o1)= butterfly_i1+butterfly_i2;\
    (o2)= butterfly_i1-butterfly_i2;\
}while(0)

/* in place: x becomes x+y and y becomes x-y (both from the old values) */
#define BUTTERFLY1(x,y) \
do{\
    const int butterfly_a= (x);\
    const int butterfly_b= (y);\
    (x)= butterfly_a+butterfly_b;\
    (y)= butterfly_a-butterfly_b;\
}while(0)

/* |x+y| + |x-y|, i.e. the final butterfly stage folded into the absolute sum */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2814
/**
 * Sum of the absolute values of the 8x8 Hadamard transform of src - dst.
 * @param s      unused
 * @param dst    block that is subtracted
 * @param src    block that is subtracted from
 * @param stride distance between two consecutive rows (both blocks)
 * @param h      must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum= 0;
    int i;

    assert(h==8);

    /* three butterfly stages along every row of the difference */
    for(i=0; i<8; i++){
        const uint8_t * const sp= src + stride*i;
        const uint8_t * const dp= dst + stride*i;
        int * const row= temp + 8*i;

        BUTTERFLY2(row[0], row[1], sp[0]-dp[0], sp[1]-dp[1]);
        BUTTERFLY2(row[2], row[3], sp[2]-dp[2], sp[3]-dp[3]);
        BUTTERFLY2(row[4], row[5], sp[4]-dp[4], sp[5]-dp[5]);
        BUTTERFLY2(row[6], row[7], sp[6]-dp[6], sp[7]-dp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* two stages down every column; the third is folded into BUTTERFLYA */
    for(i=0; i<8; i++){
        int * const col= temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }

    return sum;
}
2866
/**
 * Sum of the absolute values of the 8x8 Hadamard transform of src,
 * with the DC term left out.
 * @param s      unused
 * @param src    block to transform
 * @param dummy  unused
 * @param stride distance between two consecutive rows
 * @param h      must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum= 0;
    int i;

    assert(h==8);

    /* three butterfly stages along every row */
    for(i=0; i<8; i++){
        const uint8_t * const sp= src + stride*i;
        int * const row= temp + 8*i;

        BUTTERFLY2(row[0], row[1], sp[0], sp[1]);
        BUTTERFLY2(row[2], row[3], sp[2], sp[3]);
        BUTTERFLY2(row[4], row[5], sp[4], sp[5]);
        BUTTERFLY2(row[6], row[7], sp[6], sp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* two stages down every column; the third is folded into BUTTERFLYA */
    for(i=0; i<8; i++){
        int * const col= temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2914
bb198e19 2915static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52 2916 MpegEncContext * const s= (MpegEncContext *)c;
76fbb024
MN
2917 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2918 DCTELEM * const temp= (DCTELEM*)aligned_temp;
1457ab52 2919 int sum=0, i;
bb198e19
MN
2920
2921 assert(h==8);
1457ab52
MN
2922
2923 s->dsp.diff_pixels(temp, src1, src2, stride);
b0368839 2924 s->dsp.fdct(temp);
1457ab52
MN
2925
2926 for(i=0; i<64; i++)
2927 sum+= ABS(temp[i]);
2928
2929 return sum;
2930}
2931
0e15384d 2932void simple_idct(DCTELEM *block); //FIXME
1457ab52 2933
bb198e19 2934static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52 2935 MpegEncContext * const s= (MpegEncContext *)c;
76fbb024
MN
2936 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
2937 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2938 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
1457ab52
MN
2939 int sum=0, i;
2940
bb198e19 2941 assert(h==8);
1457ab52
MN
2942 s->mb_intra=0;
2943
2944 s->dsp.diff_pixels(temp, src1, src2, stride);
2945
2946 memcpy(bak, temp, 64*sizeof(DCTELEM));
2947
67725183 2948 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
d50635cd 2949 s->dct_unquantize_inter(s, temp, 0, s->qscale);
1457ab52
MN
2950 simple_idct(temp); //FIXME
2951
2952 for(i=0; i<64; i++)
2953 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2954
2955 return sum;
2956}
2957
bb198e19 2958static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3a87ac94 2959 MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda 2960 const uint8_t *scantable= s->intra_scantable.permutated;
76fbb024
MN
2961 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2962 uint64_t __align8 aligned_bak[stride];
2963 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2964 uint8_t * const bak= (uint8_t*)aligned_bak;
3a87ac94
MN
2965 int i, last, run, bits, level, distoration, start_i;
2966 const int esc_length= s->ac_esc_length;
2967 uint8_t * length;
2968 uint8_t * last_length;
67725183 2969
bb198e19
MN
2970 assert(h==8);
2971
67725183
MN
2972 for(i=0; i<8; i++){
2973 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
2974 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
2975 }
3a87ac94 2976
67725183
MN
2977 s->dsp.diff_pixels(temp, src1, src2, stride);
2978
2979 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2980
2981 bits=0;
3a87ac94
MN
2982
2983 if (s->mb_intra) {
67725183 2984 start_i = 1;
3a87ac94
MN
2985 length = s->intra_ac_vlc_length;
2986 last_length= s->intra_ac_vlc_last_length;
67725183 2987 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3a87ac94
MN
2988 } else {
2989 start_i = 0;
2990 length = s->inter_ac_vlc_length;
2991 last_length= s->inter_ac_vlc_last_length;
2992 }
3a87ac94 2993
67725183 2994 if(last>=start_i){
3a87ac94
MN
2995 run=0;
2996 for(i=start_i; i<last; i++){
2997 int j= scantable[i];
2998 level= temp[j];
2999
3000 if(level){
3001 level+=64;
3002 if((level&(~127)) == 0){
3003 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3004 }else
3005 bits+= esc_length;
3006 run=0;
3007 }else
3008 run++;
3009 }
3010 i= scantable[last];
1d0eab1d 3011
3a87ac94 3012 level= temp[i] + 64;
1d0eab1d
MN
3013
3014 assert(level - 64);
3015
3a87ac94
MN
3016 if((level&(~127)) == 0){
3017 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3018 }else
3019 bits+= esc_length;
3020
67725183
MN
3021 }
3022
3023 if(last>=0){
d50635cd
MN
3024 if(s->mb_intra)
3025 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3026 else
3027 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3a87ac94
MN
3028 }
3029
b0368839 3030 s->dsp.idct_add(bak, stride, temp);
3a87ac94 3031
bb198e19 3032 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3a87ac94 3033
67725183 3034 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3a87ac94
MN
3035}
3036
bb198e19 3037static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3a87ac94 3038 MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda 3039 const uint8_t *scantable= s->intra_scantable.permutated;
76fbb024
MN
3040 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3041 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3a87ac94
MN
3042 int i, last, run, bits, level, start_i;
3043 const int esc_length= s->ac_esc_length;
3044 uint8_t * length;
3045 uint8_t * last_length;
bb198e19
MN
3046
3047 assert(h==8);
67725183
MN
3048
3049 s->dsp.diff_pixels(temp, src1, src2, stride);
3a87ac94 3050
67725183
MN
3051 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3052
3053 bits=0;
3a87ac94
MN
3054
3055 if (s->mb_intra) {
67725183 3056 start_i = 1;
3a87ac94
MN
3057 length = s->intra_ac_vlc_length;
3058 last_length= s->intra_ac_vlc_last_length;
67725183 3059 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3a87ac94
MN
3060 } else {
3061 start_i = 0;
3062 length = s->inter_ac_vlc_length;
3063 last_length= s->inter_ac_vlc_last_length;
3064 }
3a87ac94 3065
67725183 3066 if(last>=start_i){
3a87ac94
MN
3067 run=0;
3068 for(i=start_i; i<last; i++){
3069 int j= scantable[i];
3070 level= temp[j];
3071
3072 if(level){
3073 level+=64;
3074 if((level&(~127)) == 0){
3075 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3076 }else
3077 bits+= esc_length;
3078 run=0;
3079 }else
3080 run++;
3081 }
3082 i= scantable[last];
67725183
MN
3083
3084 level= temp[i] + 64;
3a87ac94 3085
67725183 3086 assert(level - 64);
3a87ac94 3087
3a87ac94
MN
3088 if((level&(~127)) == 0){
3089 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3090 }else
3091 bits+= esc_length;
3092 }
3093
3094 return bits;
3095}
3096
622348f9
MN
/**
 * Sum of absolute differences between vertically adjacent pixels of a
 * 16-pixel-wide block.
 * @param c      unused
 * @param s      block to measure
 * @param dummy  unused
 * @param stride distance between two consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score= 0;
    int row, col;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++)
            score += ABS(s[col] - s[col+stride]);
        s += stride;
    }

    return score;
}
3111
/**
 * Sum of absolute vertical differences of the residual s1 - s2 for a
 * 16-pixel-wide block.
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance between two consecutive rows (both blocks)
 * @param h      number of rows; h-1 row pairs are compared
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score= 0;
    int row, col;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++)
            score += ABS(s1[col] - s2[col] - s1[col+stride] + s2[col+stride]);
        s1 += stride;
        s2 += stride;
    }

    return score;
}
3126
#define SQ(a) ((a)*(a))

/**
 * Sum of squared differences between vertically adjacent pixels of a
 * 16-pixel-wide block.
 * @param c      unused
 * @param s      block to measure
 * @param dummy  unused
 * @param stride distance between two consecutive rows
 * @param h      number of rows; h-1 row pairs are compared
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score= 0;
    int row, col;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++)
            score += SQ(s[col] - s[col+stride]);
        s += stride;
    }

    return score;
}
3142
/**
 * Sum of squared vertical differences of the residual s1 - s2 for a
 * 16-pixel-wide block.
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance between two consecutive rows (both blocks)
 * @param h      number of rows; h-1 row pairs are compared
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score= 0;
    int row, col;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++)
            score += SQ(s1[col] - s2[col] - s1[col+stride] + s2[col+stride]);
        s1 += stride;
        s2 += stride;
    }

    return score;
}
3157
/* Instantiate the *16_c comparison functions (second argument) from the 8x8
   ones (first argument); dsputil_init() installs the results in the cmp tables.
   NOTE(review): WARPER8_16_SQ is defined outside this part of the file --
   presumably it applies the 8x8 function to each 8x8 quadrant of a 16x16
   block and sums the results; confirm against the macro. */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
1457ab52 3164
b0368839
MN
3165/* XXX: those functions should be suppressed ASAP when all IDCTs are
3166 converted */
/**
 * idct_put wrapper for j_rev_dct: runs j_rev_dct() on block, then
 * hands the same buffer to put_pixels_clamped_c() to store it at dest.
 * @param dest      destination pixels
 * @param line_size distance between two consecutive rows of dest
 * @param block     coefficients; overwritten by the inverse transform
 */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * idct_add wrapper for j_rev_dct: runs j_rev_dct() on block, then
 * hands the same buffer to add_pixels_clamped_c() to add it onto dest.
 * @param dest      destination pixels
 * @param line_size distance between two consecutive rows of dest
 * @param block     coefficients; overwritten by the inverse transform
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
3177
59cf08ce
FB
3178/* init static data */
3179void dsputil_static_init(void)
e0eac44e 3180{
d2975f8d 3181 int i;
e0eac44e 3182
59cf08ce
FB
3183 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3184 for(i=0;i<MAX_NEG_CROP;i++) {
3185 cropTbl[i] = 0;
3186 cropTbl[i + MAX_NEG_CROP + 256] = 255;
3187 }
3188
3189 for(i=0;i<512;i++) {
3190 squareTbl[i] = (i - 256) * (i - 256);
3191 }
3192
3193 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3194}
92ddb692 3195
92ddb692 3196
59cf08ce
FB
3197void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3198{
3199 int i;
de6d9b64 3200
b0368839 3201#ifdef CONFIG_ENCODERS
10acc479 3202 if(avctx->dct_algo==FF_DCT_FASTINT) {
b0368839 3203 c->fdct = fdct_ifast;
48b1f800 3204 c->fdct248 = fdct_ifast248;
10acc479
RS
3205 }
3206 else if(avctx->dct_algo==FF_DCT_FAAN) {
65e4c8c9 3207 c->fdct = ff_faandct;
48b1f800 3208 c->fdct248 = ff_faandct248;
10acc479
RS
3209 }
3210 else {
b0368839 3211 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
10acc479
RS
3212 c->fdct248 = ff_fdct248_islow;
3213 }
b0368839
MN
3214#endif //CONFIG_ENCODERS
3215
3216 if(avctx->idct_algo==FF_IDCT_INT){
3217 c->idct_put= ff_jref_idct_put;
3218 c->idct_add= ff_jref_idct_add;
4fb518c3 3219 c->idct = j_rev_dct;
b0368839
MN
3220 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3221 }else{ //accurate/default
3222 c->idct_put= simple_idct_put;
3223 c->idct_add= simple_idct_add;
4fb518c3 3224 c->idct = simple_idct;
b0368839
MN
3225 c->idct_permutation_type= FF_NO_IDCT_PERM;
3226 }
3227
44cb64ee
MM
3228 /* VP3 DSP support */
3229 c->vp3_dsp_init = vp3_dsp_init_c;
116824d0 3230 c->vp3_idct = vp3_idct_c;
44cb64ee 3231
eb4b3dd3
ZK
3232 c->get_pixels = get_pixels_c;
3233 c->diff_pixels = diff_pixels_c;
3234 c->put_pixels_clamped = put_pixels_clamped_c;
f9ed9d85 3235 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
eb4b3dd3
ZK
3236 c->add_pixels_clamped = add_pixels_clamped_c;
3237 c->gmc1 = gmc1_c;
3238 c->gmc = gmc_c;
3239 c->clear_blocks = clear_blocks_c;
3240 c->pix_sum = pix_sum_c;
3241 c->pix_norm1 = pix_norm1_c;
3242
45553457 3243 /* TODO [0] 16 [1] 8 */
bb198e19
MN
3244 c->pix_abs[0][0] = pix_abs16_c;
3245 c->pix_abs[0][1] = pix_abs16_x2_c;
3246 c->pix_abs[0][2] = pix_abs16_y2_c;
3247 c->pix_abs[0][3] = pix_abs16_xy2_c;
3248 c->pix_abs[1][0] = pix_abs8_c;
3249 c->pix_abs[1][1] = pix_abs8_x2_c;
3250 c->pix_abs[1][2] = pix_abs8_y2_c;
3251 c->pix_abs[1][3] = pix_abs8_xy2_c;
eb4b3dd3 3252
45553457
ZK
3253#define dspfunc(PFX, IDX, NUM) \
3254 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3255 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3256 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3257 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3258
3259 dspfunc(put, 0, 16);
3260 dspfunc(put_no_rnd, 0, 16);
3261 dspfunc(put, 1, 8);
3262 dspfunc(put_no_rnd, 1, 8);
669ac79c
MN
3263 dspfunc(put, 2, 4);
3264 dspfunc(put, 3, 2);
45553457
ZK
3265
3266 dspfunc(avg, 0, 16);
3267 dspfunc(avg_no_rnd, 0, 16);
3268 dspfunc(avg, 1, 8);
3269 dspfunc(avg_no_rnd, 1, 8);
da3b9756
MM
3270 dspfunc(avg, 2, 4);
3271 dspfunc(avg, 3, 2);
45553457
ZK
3272#undef dspfunc
3273
c0a0170c
MN
3274 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3275 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3276
669ac79c
MN
3277 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3278 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3279 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3280 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3281 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3282 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3283 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3284 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3285 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3286
da3b9756
MM
3287 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3288 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3289 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3290 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3291 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3292 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3293 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3294 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3295 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3296
45553457
ZK
3297#define dspfunc(PFX, IDX, NUM) \
3298 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3299 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3300 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3301 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3302 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3303 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3304 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3305 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3306 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3307 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3308 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3309 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3310 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3311 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3312 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3313 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3314
3315 dspfunc(put_qpel, 0, 16);
3316 dspfunc(put_no_rnd_qpel, 0, 16);
3317
3318 dspfunc(avg_qpel, 0, 16);
3319 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3320
3321 dspfunc(put_qpel, 1, 8);
3322 dspfunc(put_no_rnd_qpel, 1, 8);
3323
3324 dspfunc(avg_qpel, 1, 8);
3325 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
0da71265
MN
3326
3327 dspfunc(put_h264_qpel, 0, 16);
3328 dspfunc(put_h264_qpel, 1, 8);
3329 dspfunc(put_h264_qpel, 2, 4);
3330 dspfunc(avg_h264_qpel, 0, 16);
3331 dspfunc(avg_h264_qpel, 1, 8);
3332 dspfunc(avg_h264_qpel, 2, 4);
3333
45553457 3334#undef dspfunc
0da71265
MN
3335 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3336 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3337 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3338 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3339 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3340 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
c9a2ebc4 3341
1457ab52
MN
3342 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3343 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3344 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3345 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3346 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3347 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3348 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3349 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
669ac79c 3350
bb198e19
MN
/* SET_CMP_FUNC(name): fill the first two slots of the table c->name with the
 * C implementations: slot 0 receives name##16_c and slot 1 receives
 * name##8x8_c.  The expansion already ends with a semicolon, so it is
 * invoked without a trailing one. */
3351#define SET_CMP_FUNC(name) \
3352 c->name[0]= name ## 16_c;\
3353 c->name[1]= name ## 8x8_c;
3354
3355 SET_CMP_FUNC(hadamard8_diff)
622348f9 3356 c->hadamard8_diff[4]= hadamard8_intra16_c;
bb198e19
MN
3357 SET_CMP_FUNC(dct_sad)
3358 c->sad[0]= pix_abs16_c;
3359 c->sad[1]= pix_abs8_c;
3360 c->sse[0]= sse16_c;
3361 c->sse[1]= sse8_c;
3362 SET_CMP_FUNC(quant_psnr)
3363 SET_CMP_FUNC(rd)
3364 SET_CMP_FUNC(bit)
622348f9
MN
3365 c->vsad[0]= vsad16_c;
3366 c->vsad[4]= vsad_intra16_c;
3367 c->vsse[0]= vsse16_c;
3368 c->vsse[4]= vsse_intra16_c;
e6a2ac34
MN
3369 c->nsse[0]= nsse16_c;
3370 c->nsse[1]= nsse8_c;
3a87ac94 3371
11f18faf
MN
3372 c->add_bytes= add_bytes_c;
3373 c->diff_bytes= diff_bytes_c;
84705403 3374 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3d2e8cce 3375 c->bswap_buf= bswap_buf;
332f9ac4
MN
3376
3377 c->h263_h_loop_filter= h263_h_loop_filter_c;
3378 c->h263_v_loop_filter= h263_v_loop_filter_c;
364a1797 3379
fdbbf2e0 3380 c->h261_loop_filter= h261_loop_filter_c;
c6148de2 3381
364a1797
MN
3382 c->try_8x8basis= try_8x8basis_c;
3383 c->add_8x8basis= add_8x8basis_c;
11f18faf 3384
980fc7b8 3385#ifdef HAVE_MMX
b0368839 3386 dsputil_init_mmx(c, avctx);
de6d9b64 3387#endif
3d03c0a2 3388#ifdef ARCH_ARMV4L
b0368839 3389 dsputil_init_armv4l(c, avctx);
3d03c0a2 3390#endif
c34270f5 3391#ifdef HAVE_MLIB
b0368839 3392 dsputil_init_mlib(c, avctx);
c34270f5 3393#endif
44f54ceb
MN
3394#ifdef ARCH_SPARC
3395 dsputil_init_vis(c,avctx);
3396#endif
1e98dffb 3397#ifdef ARCH_ALPHA
b0368839 3398 dsputil_init_alpha(c, avctx);
1e98dffb 3399#endif
59925ef2 3400#ifdef ARCH_POWERPC
b0368839 3401 dsputil_init_ppc(c, avctx);
a43bd1d7 3402#endif
d46aba26 3403#ifdef HAVE_MMI
b0368839 3404 dsputil_init_mmi(c, avctx);
d46aba26 3405#endif
0c6bd2ea
B
3406#ifdef ARCH_SH4
3407 dsputil_init_sh4(c,avctx);
3408#endif
43f1708f 3409
b0368839
MN
3410 switch(c->idct_permutation_type){
3411 case FF_NO_IDCT_PERM:
3412 for(i=0; i<64; i++)
3413 c->idct_permutation[i]= i;
3414 break;
3415 case FF_LIBMPEG2_IDCT_PERM:
3416 for(i=0; i<64; i++)
3417 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3418 break;
3419 case FF_SIMPLE_IDCT_PERM:
3420 for(i=0; i<64; i++)
3421 c->idct_permutation[i]= simple_mmx_permutation[i];
3422 break;
3423 case FF_TRANSPOSE_IDCT_PERM:
3424 for(i=0; i<64; i++)
3425 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3426 break;
3427 default:
9b879566 3428 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
b0368839 3429 }
57060b1e 3430}
b0368839 3431