h261 decoder by (Maarten Daniels <maarten.daniels at student dot luc dot ac dot be>)
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 */
983e3246
MN
22
/**
 * @file dsputil.c
 * DSP utils
 */
27
de6d9b64
FB
28#include "avcodec.h"
29#include "dsputil.h"
1457ab52 30#include "mpegvideo.h"
b0368839 31#include "simple_idct.h"
65e4c8c9 32#include "faandct.h"
5596c60c 33
0c1a9eda
ZK
34uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
35uint32_t squareTbl[512];
de6d9b64 36
/* Zigzag scan order for an 8x8 block: entry i is the raster index
 * (row * 8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
47
10acc479
RS
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
60
2f349de2 61/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
0c1a9eda 62uint16_t __align8 inv_zigzag_direct16[64];
2f349de2 63
/* Alternate (horizontal) scan order for an 8x8 block: entry i is the raster
 * index of the i-th coefficient in scan order. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
74
/* Alternate (vertical) scan order for an 8x8 block: entry i is the raster
 * index of the i-th coefficient in scan order. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
85
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
 * (the product must be evaluated in 64 bits) */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
121
b0368839
MN
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
133
/**
 * Sum the samples of a 16x16 block.
 * @param pix       pointer to the top-left sample of the block
 * @param line_size offset in bytes between the starts of consecutive rows
 * @return sum of the 256 samples
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        /* each row is consumed as two groups of 8 samples */
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        /* 16 samples were consumed; step to the start of the next row */
        pix += line_size - 16;
    }
    return s;
}
155
0c1a9eda 156static int pix_norm1_c(uint8_t * pix, int line_size)
3aa102be
MN
157{
158 int s, i, j;
0c1a9eda 159 uint32_t *sq = squareTbl + 256;
3aa102be
MN
160
161 s = 0;
162 for (i = 0; i < 16; i++) {
163 for (j = 0; j < 16; j += 8) {
2a006cd3 164#if 0
3aa102be
MN
165 s += sq[pix[0]];
166 s += sq[pix[1]];
167 s += sq[pix[2]];
168 s += sq[pix[3]];
169 s += sq[pix[4]];
170 s += sq[pix[5]];
171 s += sq[pix[6]];
172 s += sq[pix[7]];
2a006cd3
FL
173#else
174#if LONG_MAX > 2147483647
175 register uint64_t x=*(uint64_t*)pix;
176 s += sq[x&0xff];
177 s += sq[(x>>8)&0xff];
178 s += sq[(x>>16)&0xff];
179 s += sq[(x>>24)&0xff];
180 s += sq[(x>>32)&0xff];
181 s += sq[(x>>40)&0xff];
182 s += sq[(x>>48)&0xff];
183 s += sq[(x>>56)&0xff];
184#else
185 register uint32_t x=*(uint32_t*)pix;
186 s += sq[x&0xff];
187 s += sq[(x>>8)&0xff];
188 s += sq[(x>>16)&0xff];
189 s += sq[(x>>24)&0xff];
190 x=*(uint32_t*)(pix+4);
191 s += sq[x&0xff];
192 s += sq[(x>>8)&0xff];
193 s += sq[(x>>16)&0xff];
194 s += sq[(x>>24)&0xff];
195#endif
196#endif
3aa102be
MN
197 pix += 8;
198 }
199 pix += line_size - 16;
200 }
201 return s;
202}
203
3d2e8cce
MN
/**
 * Apply bswap_32() to each of w 32-bit words, reading from src and
 * writing to dst.
 * @param dst destination buffer, at least w words
 * @param src source buffer, at least w words
 * @param w   number of 32-bit words; nothing is done when w <= 0
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    /* main part: groups of 8 words */
    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    /* remaining w % 8 words */
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
3aa102be 221
bb198e19 222static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
1457ab52
MN
223{
224 int s, i;
0c1a9eda 225 uint32_t *sq = squareTbl + 256;
1457ab52
MN
226
227 s = 0;
bb198e19 228 for (i = 0; i < h; i++) {
1457ab52
MN
229 s += sq[pix1[0] - pix2[0]];
230 s += sq[pix1[1] - pix2[1]];
231 s += sq[pix1[2] - pix2[2]];
232 s += sq[pix1[3] - pix2[3]];
233 s += sq[pix1[4] - pix2[4]];
234 s += sq[pix1[5] - pix2[5]];
235 s += sq[pix1[6] - pix2[6]];
236 s += sq[pix1[7] - pix2[7]];
237 pix1 += line_size;
238 pix2 += line_size;
239 }
240 return s;
241}
242
bb198e19 243static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
9c76bd48 244{
6b026927
FH
245 int s, i;
246 uint32_t *sq = squareTbl + 256;
9c76bd48
BF
247
248 s = 0;
bb198e19 249 for (i = 0; i < h; i++) {
6b026927
FH
250 s += sq[pix1[ 0] - pix2[ 0]];
251 s += sq[pix1[ 1] - pix2[ 1]];
252 s += sq[pix1[ 2] - pix2[ 2]];
253 s += sq[pix1[ 3] - pix2[ 3]];
254 s += sq[pix1[ 4] - pix2[ 4]];
255 s += sq[pix1[ 5] - pix2[ 5]];
256 s += sq[pix1[ 6] - pix2[ 6]];
257 s += sq[pix1[ 7] - pix2[ 7]];
258 s += sq[pix1[ 8] - pix2[ 8]];
259 s += sq[pix1[ 9] - pix2[ 9]];
260 s += sq[pix1[10] - pix2[10]];
261 s += sq[pix1[11] - pix2[11]];
262 s += sq[pix1[12] - pix2[12]];
263 s += sq[pix1[13] - pix2[13]];
264 s += sq[pix1[14] - pix2[14]];
265 s += sq[pix1[15] - pix2[15]];
2a006cd3 266
6b026927
FH
267 pix1 += line_size;
268 pix2 += line_size;
9c76bd48
BF
269 }
270 return s;
271}
272
0c1a9eda 273static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
de6d9b64 274{
de6d9b64
FB
275 int i;
276
277 /* read the pixels */
de6d9b64 278 for(i=0;i<8;i++) {
c13e1abd
FH
279 block[0] = pixels[0];
280 block[1] = pixels[1];
281 block[2] = pixels[2];
282 block[3] = pixels[3];
283 block[4] = pixels[4];
284 block[5] = pixels[5];
285 block[6] = pixels[6];
286 block[7] = pixels[7];
287 pixels += line_size;
288 block += 8;
de6d9b64
FB
289 }
290}
291
0c1a9eda
ZK
292static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
293 const uint8_t *s2, int stride){
9dbcbd92
MN
294 int i;
295
296 /* read the pixels */
9dbcbd92 297 for(i=0;i<8;i++) {
c13e1abd
FH
298 block[0] = s1[0] - s2[0];
299 block[1] = s1[1] - s2[1];
300 block[2] = s1[2] - s2[2];
301 block[3] = s1[3] - s2[3];
302 block[4] = s1[4] - s2[4];
303 block[5] = s1[5] - s2[5];
304 block[6] = s1[6] - s2[6];
305 block[7] = s1[7] - s2[7];
9dbcbd92
MN
306 s1 += stride;
307 s2 += stride;
c13e1abd 308 block += 8;
9dbcbd92
MN
309 }
310}
311
312
0c1a9eda 313static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
eb4b3dd3 314 int line_size)
de6d9b64 315{
de6d9b64 316 int i;
0c1a9eda 317 uint8_t *cm = cropTbl + MAX_NEG_CROP;
de6d9b64
FB
318
319 /* read the pixels */
de6d9b64 320 for(i=0;i<8;i++) {
c13e1abd
FH
321 pixels[0] = cm[block[0]];
322 pixels[1] = cm[block[1]];
323 pixels[2] = cm[block[2]];
324 pixels[3] = cm[block[3]];
325 pixels[4] = cm[block[4]];
326 pixels[5] = cm[block[5]];
327 pixels[6] = cm[block[6]];
328 pixels[7] = cm[block[7]];
329
330 pixels += line_size;
331 block += 8;
de6d9b64
FB
332 }
333}
334
f9ed9d85
MM
335static void put_signed_pixels_clamped_c(const DCTELEM *block,
336 uint8_t *restrict pixels,
337 int line_size)
338{
339 int i, j;
340
341 for (i = 0; i < 8; i++) {
342 for (j = 0; j < 8; j++) {
343 if (*block < -128)
344 *pixels = 0;
345 else if (*block > 127)
346 *pixels = 255;
347 else
348 *pixels = (uint8_t)(*block + 128);
349 block++;
350 pixels++;
351 }
352 pixels += (line_size - 8);
353 }
354}
355
0c1a9eda 356static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
c13e1abd 357 int line_size)
de6d9b64 358{
de6d9b64 359 int i;
0c1a9eda 360 uint8_t *cm = cropTbl + MAX_NEG_CROP;
de6d9b64
FB
361
362 /* read the pixels */
de6d9b64 363 for(i=0;i<8;i++) {
c13e1abd
FH
364 pixels[0] = cm[pixels[0] + block[0]];
365 pixels[1] = cm[pixels[1] + block[1]];
366 pixels[2] = cm[pixels[2] + block[2]];
367 pixels[3] = cm[pixels[3] + block[3]];
368 pixels[4] = cm[pixels[4] + block[4]];
369 pixels[5] = cm[pixels[5] + block[5]];
370 pixels[6] = cm[pixels[6] + block[6]];
371 pixels[7] = cm[pixels[7] + block[7]];
372 pixels += line_size;
373 block += 8;
de6d9b64
FB
374 }
375}
59fe111e
MN
376#if 0
377
378#define PIXOP2(OPNAME, OP) \
b3184779 379static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
380{\
381 int i;\
382 for(i=0; i<h; i++){\
383 OP(*((uint64_t*)block), LD64(pixels));\
384 pixels+=line_size;\
385 block +=line_size;\
386 }\
387}\
388\
45553457 389static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
390{\
391 int i;\
392 for(i=0; i<h; i++){\
393 const uint64_t a= LD64(pixels );\
394 const uint64_t b= LD64(pixels+1);\
395 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
396 pixels+=line_size;\
397 block +=line_size;\
398 }\
399}\
400\
45553457 401static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
402{\
403 int i;\
404 for(i=0; i<h; i++){\
405 const uint64_t a= LD64(pixels );\
406 const uint64_t b= LD64(pixels+1);\
407 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
408 pixels+=line_size;\
409 block +=line_size;\
410 }\
411}\
412\
45553457 413static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
414{\
415 int i;\
416 for(i=0; i<h; i++){\
417 const uint64_t a= LD64(pixels );\
418 const uint64_t b= LD64(pixels+line_size);\
419 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
420 pixels+=line_size;\
421 block +=line_size;\
422 }\
423}\
424\
45553457 425static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
426{\
427 int i;\
428 for(i=0; i<h; i++){\
429 const uint64_t a= LD64(pixels );\
430 const uint64_t b= LD64(pixels+line_size);\
431 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
432 pixels+=line_size;\
433 block +=line_size;\
434 }\
435}\
436\
45553457 437static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
438{\
439 int i;\
440 const uint64_t a= LD64(pixels );\
441 const uint64_t b= LD64(pixels+1);\
442 uint64_t l0= (a&0x0303030303030303ULL)\
443 + (b&0x0303030303030303ULL)\
444 + 0x0202020202020202ULL;\
445 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
446 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
447 uint64_t l1,h1;\
448\
449 pixels+=line_size;\
450 for(i=0; i<h; i+=2){\
451 uint64_t a= LD64(pixels );\
452 uint64_t b= LD64(pixels+1);\
453 l1= (a&0x0303030303030303ULL)\
454 + (b&0x0303030303030303ULL);\
455 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
456 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
457 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
458 pixels+=line_size;\
459 block +=line_size;\
460 a= LD64(pixels );\
461 b= LD64(pixels+1);\
462 l0= (a&0x0303030303030303ULL)\
463 + (b&0x0303030303030303ULL)\
464 + 0x0202020202020202ULL;\
465 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
466 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
467 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
468 pixels+=line_size;\
469 block +=line_size;\
470 }\
471}\
472\
45553457 473static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
474{\
475 int i;\
476 const uint64_t a= LD64(pixels );\
477 const uint64_t b= LD64(pixels+1);\
478 uint64_t l0= (a&0x0303030303030303ULL)\
479 + (b&0x0303030303030303ULL)\
480 + 0x0101010101010101ULL;\
481 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
482 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
483 uint64_t l1,h1;\
484\
485 pixels+=line_size;\
486 for(i=0; i<h; i+=2){\
487 uint64_t a= LD64(pixels );\
488 uint64_t b= LD64(pixels+1);\
489 l1= (a&0x0303030303030303ULL)\
490 + (b&0x0303030303030303ULL);\
491 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
492 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
493 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
494 pixels+=line_size;\
495 block +=line_size;\
496 a= LD64(pixels );\
497 b= LD64(pixels+1);\
498 l0= (a&0x0303030303030303ULL)\
499 + (b&0x0303030303030303ULL)\
500 + 0x0101010101010101ULL;\
501 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
502 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
503 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
504 pixels+=line_size;\
505 block +=line_size;\
506 }\
507}\
508\
45553457
ZK
509CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
510CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
511CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
512CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
513CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
514CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
515CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
59fe111e
MN
516
517#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
518#else // 64 bit variant
519
520#define PIXOP2(OPNAME, OP) \
669ac79c
MN
521static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
522 int i;\
523 for(i=0; i<h; i++){\
524 OP(*((uint16_t*)(block )), LD16(pixels ));\
525 pixels+=line_size;\
526 block +=line_size;\
527 }\
528}\
0da71265
MN
529static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
530 int i;\
531 for(i=0; i<h; i++){\
532 OP(*((uint32_t*)(block )), LD32(pixels ));\
533 pixels+=line_size;\
534 block +=line_size;\
535 }\
536}\
45553457 537static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
59fe111e
MN
538 int i;\
539 for(i=0; i<h; i++){\
540 OP(*((uint32_t*)(block )), LD32(pixels ));\
541 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
542 pixels+=line_size;\
543 block +=line_size;\
544 }\
545}\
45553457
ZK
546static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
547 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
b3184779 548}\
59fe111e 549\
b3184779
MN
550static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
551 int src_stride1, int src_stride2, int h){\
59fe111e
MN
552 int i;\
553 for(i=0; i<h; i++){\
b3184779
MN
554 uint32_t a,b;\
555 a= LD32(&src1[i*src_stride1 ]);\
556 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 557 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
b3184779
MN
558 a= LD32(&src1[i*src_stride1+4]);\
559 b= LD32(&src2[i*src_stride2+4]);\
d8085ea7 560 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
59fe111e
MN
561 }\
562}\
563\
b3184779
MN
564static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
565 int src_stride1, int src_stride2, int h){\
59fe111e
MN
566 int i;\
567 for(i=0; i<h; i++){\
b3184779
MN
568 uint32_t a,b;\
569 a= LD32(&src1[i*src_stride1 ]);\
570 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 571 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
b3184779
MN
572 a= LD32(&src1[i*src_stride1+4]);\
573 b= LD32(&src2[i*src_stride2+4]);\
d8085ea7 574 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
59fe111e
MN
575 }\
576}\
577\
0da71265
MN
578static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
579 int src_stride1, int src_stride2, int h){\
580 int i;\
581 for(i=0; i<h; i++){\
582 uint32_t a,b;\
583 a= LD32(&src1[i*src_stride1 ]);\
584 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 585 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
0da71265
MN
586 }\
587}\
588\
669ac79c
MN
589static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
590 int src_stride1, int src_stride2, int h){\
591 int i;\
592 for(i=0; i<h; i++){\
593 uint32_t a,b;\
594 a= LD16(&src1[i*src_stride1 ]);\
595 b= LD16(&src2[i*src_stride2 ]);\
596 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
597 }\
598}\
599\
b3184779
MN
600static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
601 int src_stride1, int src_stride2, int h){\
602 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
603 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
604}\
605\
606static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
607 int src_stride1, int src_stride2, int h){\
608 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
609 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
610}\
611\
45553457 612static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
613 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
614}\
615\
45553457 616static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
617 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
618}\
619\
45553457 620static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
621 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
622}\
623\
45553457 624static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
625 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
626}\
627\
628static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
629 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
630 int i;\
631 for(i=0; i<h; i++){\
b3184779
MN
632 uint32_t a, b, c, d, l0, l1, h0, h1;\
633 a= LD32(&src1[i*src_stride1]);\
634 b= LD32(&src2[i*src_stride2]);\
635 c= LD32(&src3[i*src_stride3]);\
636 d= LD32(&src4[i*src_stride4]);\
637 l0= (a&0x03030303UL)\
638 + (b&0x03030303UL)\
639 + 0x02020202UL;\
640 h0= ((a&0xFCFCFCFCUL)>>2)\
641 + ((b&0xFCFCFCFCUL)>>2);\
642 l1= (c&0x03030303UL)\
643 + (d&0x03030303UL);\
644 h1= ((c&0xFCFCFCFCUL)>>2)\
645 + ((d&0xFCFCFCFCUL)>>2);\
646 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
647 a= LD32(&src1[i*src_stride1+4]);\
648 b= LD32(&src2[i*src_stride2+4]);\
649 c= LD32(&src3[i*src_stride3+4]);\
650 d= LD32(&src4[i*src_stride4+4]);\
651 l0= (a&0x03030303UL)\
652 + (b&0x03030303UL)\
653 + 0x02020202UL;\
654 h0= ((a&0xFCFCFCFCUL)>>2)\
655 + ((b&0xFCFCFCFCUL)>>2);\
656 l1= (c&0x03030303UL)\
657 + (d&0x03030303UL);\
658 h1= ((c&0xFCFCFCFCUL)>>2)\
659 + ((d&0xFCFCFCFCUL)>>2);\
660 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
661 }\
662}\
669ac79c
MN
663\
664static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
665 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
666}\
667\
668static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
669 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
670}\
671\
672static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
673 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
674}\
675\
676static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
677 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
678}\
679\
b3184779
MN
680static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
681 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
682 int i;\
683 for(i=0; i<h; i++){\
b3184779
MN
684 uint32_t a, b, c, d, l0, l1, h0, h1;\
685 a= LD32(&src1[i*src_stride1]);\
686 b= LD32(&src2[i*src_stride2]);\
687 c= LD32(&src3[i*src_stride3]);\
688 d= LD32(&src4[i*src_stride4]);\
689 l0= (a&0x03030303UL)\
690 + (b&0x03030303UL)\
691 + 0x01010101UL;\
692 h0= ((a&0xFCFCFCFCUL)>>2)\
693 + ((b&0xFCFCFCFCUL)>>2);\
694 l1= (c&0x03030303UL)\
695 + (d&0x03030303UL);\
696 h1= ((c&0xFCFCFCFCUL)>>2)\
697 + ((d&0xFCFCFCFCUL)>>2);\
698 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
699 a= LD32(&src1[i*src_stride1+4]);\
700 b= LD32(&src2[i*src_stride2+4]);\
701 c= LD32(&src3[i*src_stride3+4]);\
702 d= LD32(&src4[i*src_stride4+4]);\
703 l0= (a&0x03030303UL)\
704 + (b&0x03030303UL)\
705 + 0x01010101UL;\
706 h0= ((a&0xFCFCFCFCUL)>>2)\
707 + ((b&0xFCFCFCFCUL)>>2);\
708 l1= (c&0x03030303UL)\
709 + (d&0x03030303UL);\
710 h1= ((c&0xFCFCFCFCUL)>>2)\
711 + ((d&0xFCFCFCFCUL)>>2);\
712 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
713 }\
714}\
b3184779
MN
715static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
716 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
717 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
718 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
719}\
720static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
721 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
722 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
723 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
724}\
59fe111e 725\
669ac79c
MN
726static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
727{\
728 int i, a0, b0, a1, b1;\
729 a0= pixels[0];\
730 b0= pixels[1] + 2;\
731 a0 += b0;\
732 b0 += pixels[2];\
733\
734 pixels+=line_size;\
735 for(i=0; i<h; i+=2){\
736 a1= pixels[0];\
737 b1= pixels[1];\
738 a1 += b1;\
739 b1 += pixels[2];\
740\
741 block[0]= (a1+a0)>>2; /* FIXME non put */\
742 block[1]= (b1+b0)>>2;\
743\
744 pixels+=line_size;\
745 block +=line_size;\
746\
747 a0= pixels[0];\
748 b0= pixels[1] + 2;\
749 a0 += b0;\
750 b0 += pixels[2];\
751\
752 block[0]= (a1+a0)>>2;\
753 block[1]= (b1+b0)>>2;\
754 pixels+=line_size;\
755 block +=line_size;\
756 }\
757}\
758\
759static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
760{\
761 int i;\
762 const uint32_t a= LD32(pixels );\
763 const uint32_t b= LD32(pixels+1);\
764 uint32_t l0= (a&0x03030303UL)\
765 + (b&0x03030303UL)\
766 + 0x02020202UL;\
767 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
768 + ((b&0xFCFCFCFCUL)>>2);\
769 uint32_t l1,h1;\
770\
771 pixels+=line_size;\
772 for(i=0; i<h; i+=2){\
773 uint32_t a= LD32(pixels );\
774 uint32_t b= LD32(pixels+1);\
775 l1= (a&0x03030303UL)\
776 + (b&0x03030303UL);\
777 h1= ((a&0xFCFCFCFCUL)>>2)\
778 + ((b&0xFCFCFCFCUL)>>2);\
779 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
780 pixels+=line_size;\
781 block +=line_size;\
782 a= LD32(pixels );\
783 b= LD32(pixels+1);\
784 l0= (a&0x03030303UL)\
785 + (b&0x03030303UL)\
786 + 0x02020202UL;\
787 h0= ((a&0xFCFCFCFCUL)>>2)\
788 + ((b&0xFCFCFCFCUL)>>2);\
789 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
790 pixels+=line_size;\
791 block +=line_size;\
792 }\
793}\
794\
45553457 795static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
796{\
797 int j;\
798 for(j=0; j<2; j++){\
799 int i;\
800 const uint32_t a= LD32(pixels );\
801 const uint32_t b= LD32(pixels+1);\
802 uint32_t l0= (a&0x03030303UL)\
803 + (b&0x03030303UL)\
804 + 0x02020202UL;\
805 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
806 + ((b&0xFCFCFCFCUL)>>2);\
807 uint32_t l1,h1;\
808\
809 pixels+=line_size;\
810 for(i=0; i<h; i+=2){\
811 uint32_t a= LD32(pixels );\
812 uint32_t b= LD32(pixels+1);\
813 l1= (a&0x03030303UL)\
814 + (b&0x03030303UL);\
815 h1= ((a&0xFCFCFCFCUL)>>2)\
816 + ((b&0xFCFCFCFCUL)>>2);\
817 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
818 pixels+=line_size;\
819 block +=line_size;\
820 a= LD32(pixels );\
821 b= LD32(pixels+1);\
822 l0= (a&0x03030303UL)\
823 + (b&0x03030303UL)\
824 + 0x02020202UL;\
825 h0= ((a&0xFCFCFCFCUL)>>2)\
826 + ((b&0xFCFCFCFCUL)>>2);\
827 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
828 pixels+=line_size;\
829 block +=line_size;\
830 }\
831 pixels+=4-line_size*(h+1);\
832 block +=4-line_size*h;\
833 }\
834}\
835\
45553457 836static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
837{\
838 int j;\
839 for(j=0; j<2; j++){\
840 int i;\
841 const uint32_t a= LD32(pixels );\
842 const uint32_t b= LD32(pixels+1);\
843 uint32_t l0= (a&0x03030303UL)\
844 + (b&0x03030303UL)\
845 + 0x01010101UL;\
846 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
847 + ((b&0xFCFCFCFCUL)>>2);\
848 uint32_t l1,h1;\
849\
850 pixels+=line_size;\
851 for(i=0; i<h; i+=2){\
852 uint32_t a= LD32(pixels );\
853 uint32_t b= LD32(pixels+1);\
854 l1= (a&0x03030303UL)\
855 + (b&0x03030303UL);\
856 h1= ((a&0xFCFCFCFCUL)>>2)\
857 + ((b&0xFCFCFCFCUL)>>2);\
858 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
859 pixels+=line_size;\
860 block +=line_size;\
861 a= LD32(pixels );\
862 b= LD32(pixels+1);\
863 l0= (a&0x03030303UL)\
864 + (b&0x03030303UL)\
865 + 0x01010101UL;\
866 h0= ((a&0xFCFCFCFCUL)>>2)\
867 + ((b&0xFCFCFCFCUL)>>2);\
868 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
869 pixels+=line_size;\
870 block +=line_size;\
871 }\
872 pixels+=4-line_size*(h+1);\
873 block +=4-line_size*h;\
874 }\
875}\
876\
45553457
ZK
877CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
878CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
879CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
880CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
881CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
882CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
883CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
884CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
b3184779 885
d8085ea7 886#define op_avg(a, b) a = rnd_avg32(a, b)
59fe111e 887#endif
59fe111e
MN
888#define op_put(a, b) a = b
889
890PIXOP2(avg, op_avg)
891PIXOP2(put, op_put)
892#undef op_avg
893#undef op_put
894
de6d9b64
FB
/**
 * Rounded integer averages of two and of four values.
 * Every argument is parenthesized so that expressions containing operators
 * of lower precedence than '+' (e.g. |, &, ?:) are averaged as a whole
 * instead of being torn apart by the expansion. Each argument is still
 * evaluated exactly once.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
897
c0a0170c
MN
/**
 * Single-stride front end for put_no_rnd_pixels16_l2().
 * The one stride given here is passed for all three stride arguments of
 * the underlying routine; h is forwarded unchanged.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b,
                                     int stride, int h)
{
    put_no_rnd_pixels16_l2(dst, a, b,
                           stride, stride, stride,
                           h);
}
901
/**
 * Single-stride front end for put_no_rnd_pixels8_l2().
 * The one stride given here is passed for all three stride arguments of
 * the underlying routine; h is forwarded unchanged.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b,
                                    int stride, int h)
{
    put_no_rnd_pixels8_l2(dst, a, b,
                          stride, stride, stride,
                          h);
}
073b013d 905
/**
 * Bilinear blend of an 8-pixel-wide block, h rows high.
 * Every output pixel mixes the 2x2 source neighbourhood starting at the
 * same position; x16 and y16 are the horizontal/vertical fractions in
 * sixteenths. The four weights always sum to 256, which is why the result
 * is scaled back with >>8 after adding rounder.
 *
 * @param dst     destination, advanced by stride per row
 * @param src     source; 9 pixels of row n and of row n+1 are read per output row
 * @param stride  distance in bytes between rows, for both dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand neighbours (out of 16)
 * @param y16     vertical weight of the lower neighbours (out of 16)
 * @param rounder value added before the >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_top_left  = (16 - x16) * (16 - y16);
    const int w_top_right = x16        * (16 - y16);
    const int w_bot_left  = (16 - x16) * y16;
    const int w_bot_right = x16        * y16;
    int row;

    for (row = 0; row < h; row++) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_top_left  * src[col]   + w_top_right * src[col + 1]
                      + w_bot_left  * below[col] + w_bot_right * below[col + 1]
                      + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
928
/**
 * Global motion compensation of an 8-pixel-wide block, h rows high.
 * For every output pixel a source position is tracked in (vx, vy); its
 * upper bits (after >>16) carry an integer position plus a fraction of
 * `shift` bits. The position advances by (dxx, dyx) per output column and
 * the row start by (dxy, dyy) per output row.
 *
 * Depending on whether the 2x2 neighbourhood fits inside the picture, the
 * pixel is produced by a full bilinear blend, by a one-dimensional blend
 * along the axis that is still inside (the other coordinate being clamped
 * with clip()), or by a plain copy of the clamped source pixel.
 *
 * @param r              rounding term added before the final >>(2*shift)
 * @param width, height  picture dimensions used for the inside tests and clamping
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s         = 1 << shift;
    const int frac_mask = s - 1;
    int row;

    /* After the decrement, width/height serve both as the exclusive bound of
     * the "neighbour at +1 is still available" tests and as the upper limit
     * handed to clip(). */
    width--;
    height--;

    for (row = 0; row < h; row++) {
        uint8_t *out = dst + row * stride;
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++) { //XXX FIXME optimize
            const int pos_x    = vx >> 16;
            const int pos_y    = vy >> 16;
            const int frac_x   = pos_x & frac_mask;
            const int frac_y   = pos_y & frac_mask;
            const int src_x    = pos_x >> shift;
            const int src_y    = pos_y >> shift;
            /* the unsigned cast rejects negative coordinates in the same compare */
            const int x_inside = (unsigned)src_x < width;
            const int y_inside = (unsigned)src_y < height;
            int index;

            if (x_inside && y_inside) {
                /* full 2x2 bilinear interpolation */
                index = src_x + src_y * stride;
                out[col] = ( ( src[index           ] * (s - frac_x)
                             + src[index         +1] *      frac_x ) * (s - frac_y)
                           + ( src[index + stride  ] * (s - frac_x)
                             + src[index + stride+1] *      frac_x ) *      frac_y
                           + r) >> (shift * 2);
            } else if (x_inside) {
                /* row clamped: blend horizontally only */
                index = src_x + clip(src_y, 0, height) * stride;
                out[col] = ( ( src[index    ] * (s - frac_x)
                             + src[index + 1] *      frac_x ) * s
                           + r) >> (shift * 2);
            } else if (y_inside) {
                /* column clamped: blend vertically only */
                index = clip(src_x, 0, width) + src_y * stride;
                out[col] = ( ( src[index         ] * (s - frac_y)
                             + src[index + stride] *      frac_y ) * s
                           + r) >> (shift * 2);
            } else {
                /* both clamped: nearest border pixel */
                index = clip(src_x, 0, width) + clip(src_y, 0, height) * stride;
                out[col] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
669ac79c
MN
986
/**
 * Integer-position "put": dispatches to the fixed-width copy routine that
 * matches width (2, 4, 8 or 16). Any other width does nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    if (width == 2) {
        put_pixels2_c(dst, src, stride, height);
    } else if (width == 4) {
        put_pixels4_c(dst, src, stride, height);
    } else if (width == 8) {
        put_pixels8_c(dst, src, stride, height);
    } else if (width == 16) {
        put_pixels16_c(dst, src, stride, height);
    }
}
995
/**
 * Horizontal blend weighted 2:1 towards the current pixel.
 * 683/2048 approximates 1/3, so each output is roughly
 * (2*src[x] + src[x+1] + 1) / 3.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 2 * src[col] + src[col + 1] + 1;
            dst[col] = (683 * sum) >> 11;
        }
    }
}
1006
/**
 * Horizontal blend weighted 1:2 towards the right-hand neighbour.
 * 683/2048 approximates 1/3, so each output is roughly
 * (src[x] + 2*src[x+1] + 1) / 3.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2 * src[col + 1] + 1;
            dst[col] = (683 * sum) >> 11;
        }
    }
}
1017
/**
 * Vertical blend weighted 2:1 towards the current row.
 * 683/2048 approximates 1/3, so each output is roughly
 * (2*src[x] + src[x+stride] + 1) / 3.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 2 * src[col] + below[col] + 1;
            dst[col] = (683 * sum) >> 11;
        }
    }
}
1028
/**
 * 2-D blend of the 2x2 neighbourhood with weights 4,3 / 3,2 (sum 12).
 * 2731/32768 approximates 1/12; the +6 rounds to nearest.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 4 * src[col]   + 3 * src[col + 1]
                          + 3 * below[col] + 2 * below[col + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
    }
}
1039
/**
 * 2-D blend of the 2x2 neighbourhood with weights 3,2 / 4,3 (sum 12).
 * 2731/32768 approximates 1/12; the +6 rounds to nearest.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 3 * src[col]   + 2 * src[col + 1]
                          + 4 * below[col] + 3 * below[col + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
    }
}
1050
/**
 * Vertical blend weighted 1:2 towards the row below.
 * 683/2048 approximates 1/3, so each output is roughly
 * (src[x] + 2*src[x+stride] + 1) / 3.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2 * below[col] + 1;
            dst[col] = (683 * sum) >> 11;
        }
    }
}
1061
/**
 * 2-D blend of the 2x2 neighbourhood with weights 3,4 / 2,3 (sum 12).
 * 2731/32768 approximates 1/12; the +6 rounds to nearest.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 3 * src[col]   + 4 * src[col + 1]
                          + 2 * below[col] + 3 * below[col + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
    }
}
1072
/**
 * 2-D blend of the 2x2 neighbourhood with weights 2,3 / 3,4 (sum 12).
 * 2731/32768 approximates 1/12; the +6 rounds to nearest.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 2 * src[col]   + 3 * src[col + 1]
                          + 3 * below[col] + 4 * below[col + 1] + 6;
            dst[col] = (2731 * sum) >> 15;
        }
    }
}
da3b9756
MM
1083
/**
 * Integer-position "avg": dispatches to the fixed-width averaging routine
 * that matches width (2, 4, 8 or 16). Any other width does nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    if (width == 2) {
        avg_pixels2_c(dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_c(dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_c(dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_c(dst, src, stride, height);
    }
}
1092
/**
 * Averaging variant of the 2:1 horizontal blend: the prediction
 * (683*(2*src[x] + src[x+1] + 1)) >> 11 is combined with the pixel already
 * in dst as (dst + pred + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int pred = (683 * (2 * src[col] + src[col + 1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1103
/**
 * Averaging variant of the 1:2 horizontal blend: the prediction
 * (683*(src[x] + 2*src[x+1] + 1)) >> 11 is combined with the pixel already
 * in dst as (dst + pred + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int pred = (683 * (src[col] + 2 * src[col + 1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1114
/**
 * Averaging variant of the 2:1 vertical blend: the prediction
 * (683*(2*src[x] + src[x+stride] + 1)) >> 11 is combined with the pixel
 * already in dst as (dst + pred + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int pred = (683 * (2 * src[col] + below[col] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1125
/**
 * Averaging variant of the 4,3 / 3,2 two-dimensional blend: the prediction
 * (2731*(weighted sum + 6)) >> 15 is combined with the pixel already in dst
 * as (dst + pred + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum  = 4 * src[col]   + 3 * src[col + 1]
                           + 3 * below[col] + 2 * below[col + 1] + 6;
            const int pred = (2731 * sum) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1136
/**
 * Averaging variant of the 3,2 / 4,3 two-dimensional blend: the prediction
 * (2731*(weighted sum + 6)) >> 15 is combined with the pixel already in dst
 * as (dst + pred + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum  = 3 * src[col]   + 2 * src[col + 1]
                           + 4 * below[col] + 3 * below[col + 1] + 6;
            const int pred = (2731 * sum) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1147
/**
 * Averaging variant of the 1:2 vertical blend: the prediction
 * (683*(src[x] + 2*src[x+stride] + 1)) >> 11 is combined with the pixel
 * already in dst as (dst + pred + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int pred = (683 * (src[col] + 2 * below[col] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1158
/**
 * Averaging variant of the 3,4 / 2,3 two-dimensional blend: the prediction
 * (2731*(weighted sum + 6)) >> 15 is combined with the pixel already in dst
 * as (dst + pred + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum  = 3 * src[col]   + 4 * src[col + 1]
                           + 2 * below[col] + 3 * below[col + 1] + 6;
            const int pred = (2731 * sum) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1169
/**
 * Averaging variant of the 2,3 / 3,4 two-dimensional blend: the prediction
 * (2731*(weighted sum + 6)) >> 15 is combined with the pixel already in dst
 * as (dst + pred + 1) >> 1.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum  = 2 * src[col]   + 3 * src[col + 1]
                           + 3 * below[col] + 4 * below[col + 1] + 6;
            const int pred = (2731 * sum) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
669ac79c
MN
1180#if 0
1181#define TPEL_WIDTH(width)\
1182static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1183 void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1184static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1185 void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1186static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1187 void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1188static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1189 void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1190static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1191 void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1192static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1193 void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1194static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1195 void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1196static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1197 void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1198static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1199 void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1200#endif
1201
0da71265
MN
/**
 * Generates the bilinear chroma interpolators OPNAME##h264_chroma_mc{2,4,8}_c
 * for blocks 2, 4 and 8 pixels wide.
 * x and y (each asserted to lie in 0..7) are the fractional offsets in
 * eighths; the four weights A..D always sum to 64. OP receives the
 * destination pixel and the unscaled weighted sum and is responsible for
 * rounding, scaling and storing.
 * Per output row, width+1 pixels of the current and of the next source row
 * are read.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<2; col++){\
            OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<4; col++){\
            OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<8; col++){\
            OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
        }\
    }\
}

/* b is the weighted sum scaled by 64: +32 rounds, >>6 removes the scale.
 * op_avg additionally takes the rounded mean with the pixel already in a. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1272
/**
 * Copies h rows of a block from src to dst, one LD32/ST32 pair per row
 * (4 bytes, assuming LD32/ST32 are the project's 32-bit load/store helpers).
 * dst and src advance by their own strides after every row.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        ST32(dst, LD32(src));
    }
}
1283
/**
 * Copies h rows of a block from src to dst, two LD32/ST32 pairs per row
 * (offsets 0 and 4; 8 bytes assuming 32-bit load/store helpers).
 * dst and src advance by their own strides after every row.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        int off;
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
    }
}
1295
/**
 * Copies h rows of a block from src to dst, four LD32/ST32 pairs per row
 * (offsets 0, 4, 8, 12; 16 bytes assuming 32-bit load/store helpers).
 * dst and src advance by their own strides after every row.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        int off;
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
    }
}
073b013d 1309
/**
 * Copies h rows of a 17-byte-wide block from src to dst: four LD32/ST32
 * pairs (offsets 0..12, assuming 32-bit load/store helpers) followed by a
 * single byte at offset 16.
 * dst and src advance by their own strides after every row.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        int off;
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[16] = src[16];
    }
}
1324
/**
 * Copies h rows of a 9-byte-wide block from src to dst: two LD32/ST32
 * pairs (offsets 0 and 4, assuming 32-bit load/store helpers) followed by
 * a single byte at offset 8.
 * dst and src advance by their own strides after every row.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        int off;
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[8] = src[8];
    }
}
1337
826f429a 1338
b3184779 1339#define QPEL_MC(r, OPNAME, RND, OP) \
0c1a9eda
ZK
1340static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1341 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779
MN
1342 int i;\
1343 for(i=0; i<h; i++)\
1344 {\
1345 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1346 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1347 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1348 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1349 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1350 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1351 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1352 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1353 dst+=dstStride;\
1354 src+=srcStride;\
1355 }\
44eb4951
MN
1356}\
1357\
0c1a9eda 1358static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
db794953 1359 const int w=8;\
0c1a9eda 1360 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779
MN
1361 int i;\
1362 for(i=0; i<w; i++)\
1363 {\
1364 const int src0= src[0*srcStride];\
1365 const int src1= src[1*srcStride];\
1366 const int src2= src[2*srcStride];\
1367 const int src3= src[3*srcStride];\
1368 const int src4= src[4*srcStride];\
1369 const int src5= src[5*srcStride];\
1370 const int src6= src[6*srcStride];\
1371 const int src7= src[7*srcStride];\
1372 const int src8= src[8*srcStride];\
1373 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1374 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1375 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1376 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1377 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1378 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1379 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1380 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1381 dst++;\
1382 src++;\
1383 }\
1384}\
1385\
0c1a9eda
ZK
1386static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1387 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779 1388 int i;\
826f429a 1389 \
b3184779
MN
1390 for(i=0; i<h; i++)\
1391 {\
1392 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1393 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1394 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1395 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1396 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1397 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1398 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1399 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1400 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1401 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1402 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1403 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1404 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1405 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1406 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1407 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1408 dst+=dstStride;\
1409 src+=srcStride;\
1410 }\
1411}\
1412\
0c1a9eda
ZK
1413static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1414 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779 1415 int i;\
826f429a 1416 const int w=16;\
b3184779
MN
1417 for(i=0; i<w; i++)\
1418 {\
1419 const int src0= src[0*srcStride];\
1420 const int src1= src[1*srcStride];\
1421 const int src2= src[2*srcStride];\
1422 const int src3= src[3*srcStride];\
1423 const int src4= src[4*srcStride];\
1424 const int src5= src[5*srcStride];\
1425 const int src6= src[6*srcStride];\
1426 const int src7= src[7*srcStride];\
1427 const int src8= src[8*srcStride];\
1428 const int src9= src[9*srcStride];\
1429 const int src10= src[10*srcStride];\
1430 const int src11= src[11*srcStride];\
1431 const int src12= src[12*srcStride];\
1432 const int src13= src[13*srcStride];\
1433 const int src14= src[14*srcStride];\
1434 const int src15= src[15*srcStride];\
1435 const int src16= src[16*srcStride];\
1436 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1437 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1438 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1439 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1440 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1441 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1442 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1443 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1444 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1445 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1446 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1447 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1448 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1449 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1450 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1451 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1452 dst++;\
1453 src++;\
1454 }\
1455}\
1456\
0c1a9eda 1457static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
45553457 1458 OPNAME ## pixels8_c(dst, src, stride, 8);\
b3184779
MN
1459}\
1460\
0c1a9eda
ZK
1461static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1462 uint8_t half[64];\
b3184779
MN
1463 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1464 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
44eb4951
MN
1465}\
1466\
0c1a9eda 1467static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
b3184779 1468 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
44eb4951
MN
1469}\
1470\
0c1a9eda
ZK
1471static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1472 uint8_t half[64];\
b3184779
MN
1473 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1474 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
44eb4951
MN
1475}\
1476\
0c1a9eda
ZK
1477static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1478 uint8_t full[16*9];\
1479 uint8_t half[64];\
b3184779 1480 copy_block9(full, src, 16, stride, 9);\
db794953 1481 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 1482 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
44eb4951
MN
1483}\
1484\
0c1a9eda
ZK
1485static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1486 uint8_t full[16*9];\
b3184779 1487 copy_block9(full, src, 16, stride, 9);\
db794953 1488 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
44eb4951
MN
1489}\
1490\
0c1a9eda
ZK
1491static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1492 uint8_t full[16*9];\
1493 uint8_t half[64];\
b3184779 1494 copy_block9(full, src, 16, stride, 9);\
db794953 1495 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 1496 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
44eb4951 1497}\
0c1a9eda
ZK
1498void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1499 uint8_t full[16*9];\
1500 uint8_t halfH[72];\
1501 uint8_t halfV[64];\
1502 uint8_t halfHV[64];\
b3184779
MN
1503 copy_block9(full, src, 16, stride, 9);\
1504 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1505 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1506 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1507 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1508}\
0c1a9eda
ZK
1509static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1510 uint8_t full[16*9];\
1511 uint8_t halfH[72];\
1512 uint8_t halfHV[64];\
db794953
MN
1513 copy_block9(full, src, 16, stride, 9);\
1514 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1515 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1516 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1517 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1518}\
0c1a9eda
ZK
1519void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1520 uint8_t full[16*9];\
1521 uint8_t halfH[72];\
1522 uint8_t halfV[64];\
1523 uint8_t halfHV[64];\
b3184779
MN
1524 copy_block9(full, src, 16, stride, 9);\
1525 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1526 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1527 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1528 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1529}\
0c1a9eda
ZK
1530static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1531 uint8_t full[16*9];\
1532 uint8_t halfH[72];\
1533 uint8_t halfHV[64];\
db794953
MN
1534 copy_block9(full, src, 16, stride, 9);\
1535 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1536 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1537 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1538 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1539}\
0c1a9eda
ZK
1540void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1541 uint8_t full[16*9];\
1542 uint8_t halfH[72];\
1543 uint8_t halfV[64];\
1544 uint8_t halfHV[64];\
b3184779
MN
1545 copy_block9(full, src, 16, stride, 9);\
1546 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1547 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1548 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1549 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1550}\
0c1a9eda
ZK
1551static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1552 uint8_t full[16*9];\
1553 uint8_t halfH[72];\
1554 uint8_t halfHV[64];\
db794953
MN
1555 copy_block9(full, src, 16, stride, 9);\
1556 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1557 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1558 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1559 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1560}\
0c1a9eda
ZK
1561void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1562 uint8_t full[16*9];\
1563 uint8_t halfH[72];\
1564 uint8_t halfV[64];\
1565 uint8_t halfHV[64];\
b3184779
MN
1566 copy_block9(full, src, 16, stride, 9);\
1567 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
db794953
MN
1568 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1569 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1570 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1571}\
0c1a9eda
ZK
1572static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1573 uint8_t full[16*9];\
1574 uint8_t halfH[72];\
1575 uint8_t halfHV[64];\
db794953
MN
1576 copy_block9(full, src, 16, stride, 9);\
1577 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1578 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1579 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1580 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1581}\
0c1a9eda
ZK
1582static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1583 uint8_t halfH[72];\
1584 uint8_t halfHV[64];\
b3184779 1585 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1586 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1587 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
44eb4951 1588}\
0c1a9eda
ZK
1589static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1590 uint8_t halfH[72];\
1591 uint8_t halfHV[64];\
b3184779 1592 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1593 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1594 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
44eb4951 1595}\
0c1a9eda
ZK
1596void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1597 uint8_t full[16*9];\
1598 uint8_t halfH[72];\
1599 uint8_t halfV[64];\
1600 uint8_t halfHV[64];\
b3184779
MN
1601 copy_block9(full, src, 16, stride, 9);\
1602 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1603 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1604 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1605 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1606}\
0c1a9eda
ZK
1607static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1608 uint8_t full[16*9];\
1609 uint8_t halfH[72];\
db794953
MN
1610 copy_block9(full, src, 16, stride, 9);\
1611 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1612 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1613 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1614}\
0c1a9eda
ZK
1615void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1616 uint8_t full[16*9];\
1617 uint8_t halfH[72];\
1618 uint8_t halfV[64];\
1619 uint8_t halfHV[64];\
b3184779
MN
1620 copy_block9(full, src, 16, stride, 9);\
1621 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1622 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1623 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1624 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1625}\
0c1a9eda
ZK
1626static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1627 uint8_t full[16*9];\
1628 uint8_t halfH[72];\
db794953
MN
1629 copy_block9(full, src, 16, stride, 9);\
1630 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1631 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1632 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1633}\
0c1a9eda
ZK
1634static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1635 uint8_t halfH[72];\
b3184779 1636 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1637 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
b3184779 1638}\
0c1a9eda 1639static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
45553457 1640 OPNAME ## pixels16_c(dst, src, stride, 16);\
b3184779
MN
1641}\
1642\
0c1a9eda
ZK
1643static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1644 uint8_t half[256];\
b3184779
MN
1645 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1646 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1647}\
1648\
0c1a9eda 1649static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
b3184779 1650 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
44eb4951 1651}\
b3184779 1652\
0c1a9eda
ZK
1653static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1654 uint8_t half[256];\
b3184779
MN
1655 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1656 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1657}\
1658\
0c1a9eda
ZK
1659static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1660 uint8_t full[24*17];\
1661 uint8_t half[256];\
b3184779 1662 copy_block17(full, src, 24, stride, 17);\
826f429a 1663 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1664 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1665}\
1666\
0c1a9eda
ZK
1667static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1668 uint8_t full[24*17];\
b3184779 1669 copy_block17(full, src, 24, stride, 17);\
826f429a 1670 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
b3184779
MN
1671}\
1672\
0c1a9eda
ZK
1673static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1674 uint8_t full[24*17];\
1675 uint8_t half[256];\
b3184779 1676 copy_block17(full, src, 24, stride, 17);\
826f429a 1677 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1678 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1679}\
0c1a9eda
ZK
1680void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1681 uint8_t full[24*17];\
1682 uint8_t halfH[272];\
1683 uint8_t halfV[256];\
1684 uint8_t halfHV[256];\
b3184779
MN
1685 copy_block17(full, src, 24, stride, 17);\
1686 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1687 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1688 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1689 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1690}\
0c1a9eda
ZK
1691static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1692 uint8_t full[24*17];\
1693 uint8_t halfH[272];\
1694 uint8_t halfHV[256];\
db794953
MN
1695 copy_block17(full, src, 24, stride, 17);\
1696 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1697 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1698 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1699 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1700}\
0c1a9eda
ZK
1701void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1702 uint8_t full[24*17];\
1703 uint8_t halfH[272];\
1704 uint8_t halfV[256];\
1705 uint8_t halfHV[256];\
b3184779
MN
1706 copy_block17(full, src, 24, stride, 17);\
1707 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1708 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1709 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1710 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1711}\
0c1a9eda
ZK
1712static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1713 uint8_t full[24*17];\
1714 uint8_t halfH[272];\
1715 uint8_t halfHV[256];\
db794953
MN
1716 copy_block17(full, src, 24, stride, 17);\
1717 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1718 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1719 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1720 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1721}\
0c1a9eda
ZK
1722void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1723 uint8_t full[24*17];\
1724 uint8_t halfH[272];\
1725 uint8_t halfV[256];\
1726 uint8_t halfHV[256];\
b3184779
MN
1727 copy_block17(full, src, 24, stride, 17);\
1728 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1729 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1730 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1731 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1732}\
0c1a9eda
ZK
1733static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1734 uint8_t full[24*17];\
1735 uint8_t halfH[272];\
1736 uint8_t halfHV[256];\
db794953
MN
1737 copy_block17(full, src, 24, stride, 17);\
1738 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1739 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1740 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1741 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1742}\
0c1a9eda
ZK
1743void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1744 uint8_t full[24*17];\
1745 uint8_t halfH[272];\
1746 uint8_t halfV[256];\
1747 uint8_t halfHV[256];\
b3184779
MN
1748 copy_block17(full, src, 24, stride, 17);\
1749 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
826f429a
MN
1750 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1751 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1752 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1753}\
0c1a9eda
ZK
1754static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1755 uint8_t full[24*17];\
1756 uint8_t halfH[272];\
1757 uint8_t halfHV[256];\
db794953
MN
1758 copy_block17(full, src, 24, stride, 17);\
1759 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1760 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1761 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1762 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1763}\
0c1a9eda
ZK
1764static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1765 uint8_t halfH[272];\
1766 uint8_t halfHV[256];\
b3184779 1767 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1768 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1769 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1770}\
0c1a9eda
ZK
1771static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1772 uint8_t halfH[272];\
1773 uint8_t halfHV[256];\
b3184779 1774 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1775 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1776 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1777}\
0c1a9eda
ZK
1778void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1779 uint8_t full[24*17];\
1780 uint8_t halfH[272];\
1781 uint8_t halfV[256];\
1782 uint8_t halfHV[256];\
b3184779
MN
1783 copy_block17(full, src, 24, stride, 17);\
1784 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1785 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1786 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1787 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1788}\
0c1a9eda
ZK
1789static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1790 uint8_t full[24*17];\
1791 uint8_t halfH[272];\
db794953
MN
1792 copy_block17(full, src, 24, stride, 17);\
1793 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1794 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1795 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1796}\
0c1a9eda
ZK
1797void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1798 uint8_t full[24*17];\
1799 uint8_t halfH[272];\
1800 uint8_t halfV[256];\
1801 uint8_t halfHV[256];\
b3184779
MN
1802 copy_block17(full, src, 24, stride, 17);\
1803 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1804 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1805 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1806 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1807}\
0c1a9eda
ZK
1808static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1809 uint8_t full[24*17];\
1810 uint8_t halfH[272];\
db794953
MN
1811 copy_block17(full, src, 24, stride, 17);\
1812 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1813 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1814 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1815}\
0c1a9eda
ZK
1816static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1817 uint8_t halfH[272];\
b3184779 1818 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1819 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
45553457 1820}
44eb4951 1821
b3184779
MN
1822#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1823#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1824#define op_put(a, b) a = cm[((b) + 16)>>5]
1825#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1826
1827QPEL_MC(0, put_ , _ , op_put)
1828QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1829QPEL_MC(0, avg_ , _ , op_avg)
1830//QPEL_MC(1, avg_no_rnd , _ , op_avg)
1831#undef op_avg
1832#undef op_avg_no_rnd
1833#undef op_put
1834#undef op_put_no_rnd
44eb4951 1835
0da71265
MN
1836#if 1
/**
 * Generate the 6-tap (1, -5, 20, 20, -5, 1) low-pass kernels used by the
 * h264 quarter-pel functions, for 4x4, 8x8 and 16x16 blocks.
 *
 * @param OPNAME prefix pasted in front of every generated function name
 * @param OP     store operator OP(dst_lvalue, sum) for the single-pass
 *               (h or v) kernels; receives the unscaled 6-tap sum
 * @param OP2    store operator for the two-pass (hv) kernels; receives the
 *               unscaled sum of a 6-tap filter applied to 6-tap sums
 *
 * The h kernels read src[-2 .. N+2] on each of N rows; the v kernels read
 * rows -2 .. N+2 of each of N columns.  The hv kernels first write N+5 rows
 * of horizontal sums into tmp (row pitch tmpStride) and then filter those
 * vertically.  OP/OP2 are expected to use the local `cm` table pointer.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int row, x;\
    for(row=0; row<h; row++)\
    {\
        for(x=0; x<4; x++)\
        {\
            OP(dst[x], (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]));\
        }\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int col, k;\
    for(col=0; col<w; col++)\
    {\
        int p[4+5]; /* rows -2..6 of this column, all read before any store */\
        for(k=0; k<4+5; k++)\
        {\
            p[k]= src[(k-2)*srcStride];\
        }\
        for(k=0; k<4; k++)\
        {\
            OP(dst[k*dstStride], (p[k+2]+p[k+3])*20 - (p[k+1]+p[k+4])*5 + (p[k]+p[k+5]));\
        }\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i, k;\
    /* pass 1: horizontal sums for rows -2 .. h+2 */\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        for(k=0; k<w; k++)\
        {\
            tmp[k]= (src[k]+src[k+1])*20 - (src[k-1]+src[k+2])*5 + (src[k-2]+src[k+3]);\
        }\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind so that tmp row 0 corresponds to output row 0 */\
    tmp -= tmpStride*(h+5-2);\
    /* pass 2: vertical filter over the intermediate sums */\
    for(i=0; i<w; i++)\
    {\
        int t[4+5];\
        for(k=0; k<h+5; k++)\
        {\
            t[k]= tmp[(k-2)*tmpStride];\
        }\
        for(k=0; k<h; k++)\
        {\
            OP2(dst[k*dstStride], (t[k+2]+t[k+3])*20 - (t[k+1]+t[k+4])*5 + (t[k]+t[k+5]));\
        }\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int row, x;\
    for(row=0; row<h; row++)\
    {\
        for(x=0; x<8; x++)\
        {\
            OP(dst[x], (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]));\
        }\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int col, k;\
    for(col=0; col<w; col++)\
    {\
        int p[8+5]; /* rows -2..10 of this column, all read before any store */\
        for(k=0; k<8+5; k++)\
        {\
            p[k]= src[(k-2)*srcStride];\
        }\
        for(k=0; k<8; k++)\
        {\
            OP(dst[k*dstStride], (p[k+2]+p[k+3])*20 - (p[k+1]+p[k+4])*5 + (p[k]+p[k+5]));\
        }\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i, k;\
    /* pass 1: horizontal sums for rows -2 .. h+2 */\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        for(k=0; k<w; k++)\
        {\
            tmp[k]= (src[k]+src[k+1])*20 - (src[k-1]+src[k+2])*5 + (src[k-2]+src[k+3]);\
        }\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind so that tmp row 0 corresponds to output row 0 */\
    tmp -= tmpStride*(h+5-2);\
    /* pass 2: vertical filter over the intermediate sums */\
    for(i=0; i<w; i++)\
    {\
        int t[8+5];\
        for(k=0; k<h+5; k++)\
        {\
            t[k]= tmp[(k-2)*tmpStride];\
        }\
        for(k=0; k<h; k++)\
        {\
            OP2(dst[k*dstStride], (t[k+2]+t[k+3])*20 - (t[k+1]+t[k+4])*5 + (t[k]+t[k+5]));\
        }\
        dst++;\
        tmp++;\
    }\
}\
\
/* The 16x16 kernels are four 8x8 kernels: left/right halves, then 8 rows down. */\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}

/**
 * Generate the sixteen h264 quarter-pel entry points
 * OPNAME h264_qpel<SIZE>_mcXY_c(dst, src, stride) for one block size.
 *
 * @param OPNAME prefix of the generated functions and of the final store
 *               helpers (OPNAME pixels<SIZE>_c / _l2, OPNAME ..._lowpass)
 * @param SIZE   block width and height (instantiated with 4, 8 and 16)
 *
 * In the mcXY names the first digit selects the horizontal case and the
 * second the vertical case.  Intermediate planes are always produced with
 * the put_ kernels into SIZE-pitched stack buffers; only the last step uses
 * OPNAME.  Vertical filtering of the picture works on a copy that starts two
 * rows above the block and holds SIZE+5 rows, because the vertical kernel
 * reads rows -2 .. SIZE+2.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hp[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hp, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hp[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hp, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const block0= padded + SIZE*2; /* row 0 of the block inside the copy */\
    uint8_t hp[SIZE*SIZE];\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hp, block0, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, block0, hp, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const block0= padded + SIZE*2;\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, block0, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const block0= padded + SIZE*2;\
    uint8_t hp[SIZE*SIZE];\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hp, block0, SIZE, SIZE);\
    /* second source is the block one row further down */\
    OPNAME ## pixels ## SIZE ## _l2(dst, block0+SIZE, hp, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const block0= padded + SIZE*2;\
    uint8_t hp_h[SIZE*SIZE];\
    uint8_t hp_v[SIZE*SIZE];\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hp_v, block0, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hp_h, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hp_h, hp_v, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const block0= padded + SIZE*2;\
    uint8_t hp_h[SIZE*SIZE];\
    uint8_t hp_v[SIZE*SIZE];\
    /* vertical plane is taken one column to the right */\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hp_v, block0, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hp_h, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hp_h, hp_v, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const block0= padded + SIZE*2;\
    uint8_t hp_h[SIZE*SIZE];\
    uint8_t hp_v[SIZE*SIZE];\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hp_v, block0, SIZE, SIZE);\
    /* horizontal plane is taken one row further down */\
    put_h264_qpel ## SIZE ## _h_lowpass(hp_h, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hp_h, hp_v, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const block0= padded + SIZE*2;\
    uint8_t hp_h[SIZE*SIZE];\
    uint8_t hp_v[SIZE*SIZE];\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hp_v, block0, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hp_h, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hp_h, hp_v, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t work[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, work, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t work[SIZE*(SIZE+5)];\
    uint8_t hp_h[SIZE*SIZE];\
    uint8_t hp_hv[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(hp_hv, work, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(hp_h, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hp_h, hp_hv, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t work[SIZE*(SIZE+5)];\
    uint8_t hp_h[SIZE*SIZE];\
    uint8_t hp_hv[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(hp_hv, work, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(hp_h, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hp_h, hp_hv, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const block0= padded + SIZE*2;\
    int16_t work[SIZE*(SIZE+5)];\
    uint8_t hp_v[SIZE*SIZE];\
    uint8_t hp_hv[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(hp_hv, work, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (padded, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hp_v, block0, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hp_v, hp_hv, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t padded[SIZE*(SIZE+5)];\
    uint8_t * const block0= padded + SIZE*2;\
    int16_t work[SIZE*(SIZE+5)];\
    uint8_t hp_v[SIZE*SIZE];\
    uint8_t hp_hv[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(hp_hv, work, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (padded, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hp_v, block0, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hp_v, hp_hv, stride, SIZE, SIZE, SIZE);\
}

2177#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2178//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2179#define op_put(a, b) a = cm[((b) + 16)>>5]
2180#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2181#define op2_put(a, b) a = cm[((b) + 512)>>10]
2182
2183H264_LOWPASS(put_ , op_put, op2_put)
2184H264_LOWPASS(avg_ , op_avg, op2_avg)
2185H264_MC(put_, 4)
2186H264_MC(put_, 8)
2187H264_MC(put_, 16)
2188H264_MC(avg_, 4)
2189H264_MC(avg_, 8)
2190H264_MC(avg_, 16)
2191
2192#undef op_avg
2193#undef op_put
2194#undef op2_avg
2195#undef op2_put
2196#endif
2197
1457ab52
MN
2198static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2199 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2200 int i;
2201
2202 for(i=0; i<h; i++){
2203 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2204 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2205 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2206 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2207 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2208 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2209 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2210 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2211 dst+=dstStride;
2212 src+=srcStride;
2213 }
2214}
2215
2216static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2217 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2218 int i;
2219
2220 for(i=0; i<w; i++){
2221 const int src_1= src[ -srcStride];
2222 const int src0 = src[0 ];
2223 const int src1 = src[ srcStride];
2224 const int src2 = src[2*srcStride];
2225 const int src3 = src[3*srcStride];
2226 const int src4 = src[4*srcStride];
2227 const int src5 = src[5*srcStride];
2228 const int src6 = src[6*srcStride];
2229 const int src7 = src[7*srcStride];
2230 const int src8 = src[8*srcStride];
2231 const int src9 = src[9*srcStride];
2232 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2233 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2234 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2235 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2236 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2237 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2238 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2239 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2240 src++;
2241 dst++;
2242 }
2243}
2244
/** mspel position (0,0): no filtering, hands the 8x8 block to put_pixels8_c. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
2248
/**
 * mspel position (1,0): combines the source block with its horizontally
 * filtered version through put_pixels8_l2 (presumably a two-source
 * average; that helper is defined elsewhere).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[64]; /* 8x8, pitch 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2254
/** mspel position (2,0): horizontal low-pass written straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2258
/**
 * mspel position (3,0): like (1,0), but the unfiltered operand handed to
 * put_pixels8_l2 is the block one pixel to the right (src+1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[64]; /* 8x8, pitch 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
2264
/** mspel position (0,2): vertical low-pass written straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2268
/**
 * mspel position (1,2): combines the vertically filtered block with the
 * horizontally-then-vertically filtered block via put_pixels8_l2.
 *
 * The horizontal pass covers 11 rows starting one row above the block,
 * because the following vertical pass reads rows -1 .. 9.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[88];  /* 11 rows x 8, pitch 8; row 1 is block row 0 */
    uint8_t plane_v[64];
    uint8_t plane_hv[64];

    wmv2_mspel8_v_lowpass(plane_v, src, 8, stride, 8);
    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(plane_hv, rows_h+8, 8, 8, 8);
    put_pixels8_l2(dst, plane_v, plane_hv, stride, 8, 8, 8);
}
/**
 * mspel position (3,2): like (1,2), but the vertically filtered plane is
 * computed from the block one pixel to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[88];  /* 11 rows x 8, pitch 8; row 1 is block row 0 */
    uint8_t plane_v[64];
    uint8_t plane_hv[64];

    wmv2_mspel8_v_lowpass(plane_v, src+1, 8, stride, 8);
    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(plane_hv, rows_h+8, 8, 8, 8);
    put_pixels8_l2(dst, plane_v, plane_hv, stride, 8, 8, 8);
}
/**
 * mspel position (2,2): horizontal low-pass over 11 rows (starting one row
 * above the block) followed by a vertical low-pass written into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[88]; /* 11 rows x 8, pitch 8; row 1 is block row 0 */

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, rows_h+8, stride, 8, 8);
}
2292
332f9ac4
MN
2293static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2294 int x;
2295 const int strength= ff_h263_loop_filter_strength[qscale];
2296
2297 for(x=0; x<8; x++){
2298 int d1, d2, ad1;
2299 int p0= src[x-2*stride];
2300 int p1= src[x-1*stride];
2301 int p2= src[x+0*stride];
2302 int p3= src[x+1*stride];
2303 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2304
2305 if (d<-2*strength) d1= 0;
2306 else if(d<- strength) d1=-2*strength - d;
2307 else if(d< strength) d1= d;
2308 else if(d< 2*strength) d1= 2*strength - d;
2309 else d1= 0;
2310
2311 p1 += d1;
2312 p2 -= d1;
2313 if(p1&256) p1= ~(p1>>31);
2314 if(p2&256) p2= ~(p2>>31);
2315
2316 src[x-1*stride] = p1;
2317 src[x+0*stride] = p2;
2318
5b5404e3 2319 ad1= ABS(d1)>>1;
332f9ac4
MN
2320
2321 d2= clip((p0-p3)/4, -ad1, ad1);
2322
2323 src[x-2*stride] = p0 - d2;
2324 src[x+ stride] = p3 + d2;
2325 }
2326}
2327
2328static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2329 int y;
2330 const int strength= ff_h263_loop_filter_strength[qscale];
2331
2332 for(y=0; y<8; y++){
2333 int d1, d2, ad1;
2334 int p0= src[y*stride-2];
2335 int p1= src[y*stride-1];
2336 int p2= src[y*stride+0];
2337 int p3= src[y*stride+1];
2338 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2339
2340 if (d<-2*strength) d1= 0;
2341 else if(d<- strength) d1=-2*strength - d;
2342 else if(d< strength) d1= d;
2343 else if(d< 2*strength) d1= 2*strength - d;
2344 else d1= 0;
2345
2346 p1 += d1;
2347 p2 -= d1;
2348 if(p1&256) p1= ~(p1>>31);
2349 if(p2&256) p2= ~(p2>>31);
2350
2351 src[y*stride-1] = p1;
2352 src[y*stride+0] = p2;
2353
2354 ad1= ABS(d1)>>1;
2355
2356 d2= clip((p0-p3)/4, -ad1, ad1);
2357
2358 src[y*stride-2] = p0 - d2;
2359 src[y*stride+1] = p3 + d2;
2360 }
2361}
1457ab52 2362
c6148de2
MN
/**
 * Vertical (1, 2, 1)/4 smoothing of an 8x8 block.
 *
 * @param dest   output, row pitch stride
 * @param src    input 8x8 block with a fixed row pitch of 8
 * @param stride row pitch of dest
 *
 * Only rows 1..6 of dest are written; rows 0 and 7 are left untouched.
 * Each output is (above + 2*centre + below + 2) >> 2.
 */
static void h261_v_loop_filter_c(uint8_t *dest,uint8_t *src, int stride){
    int col, row;

    for(col=0; col<8; col++){
        for(row=1; row<7; row++){
            const int in  = row*8 + col;
            const int sum = src[in-8] + 2*src[in] + src[in+8];

            dest[row*stride + col] = (uint8_t)((sum + 2) >> 2);
        }
    }
}
2377
/**
 * Horizontal (1,2,1)/4 filter of an 8x8 block.
 * Columns 1..6 of dest are written; columns 0 and 7 are not touched.
 * @param dest   output, rows are stride bytes apart
 * @param src    input 8x8 block stored with a fixed row pitch of 8
 * @param stride row pitch of dest
 */
static void h261_h_loop_filter_c(uint8_t *dest,uint8_t *src, int stride){
    int col, row;

    for(col=1; col<7; col++){
        for(row=0; row<8; row++){
            const uint8_t * const center= &src[row*8 + col];
            const int filtered= (center[-1] + 2*center[0] + center[1] + 2) >> 2;

            dest[row*stride + col]= (uint8_t)filtered;
        }
    }
}
2392
/**
 * Sum of absolute differences between two 16-pixel-wide areas.
 * @param v         unused
 * @param line_size distance in bytes between rows (same for both inputs)
 * @param h         number of rows
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2420
/**
 * SAD of a 16-pixel-wide area against the reference averaged with its
 * right-hand neighbour (avg2 of pix2[x] and pix2[x+1]); reads 17 bytes per
 * row of pix2.
 * @param v         unused
 * @param line_size distance in bytes between rows
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2448
/**
 * SAD of a 16-pixel-wide area against the reference averaged with the row
 * below it (avg2 of pix2[x] and pix2[x+line_size]); reads h+1 rows of pix2.
 * @param v         unused
 * @param line_size distance in bytes between rows
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below= pix2 + line_size;
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2478
/**
 * SAD of a 16-pixel-wide area against the reference averaged over each 2x2
 * neighbourhood (avg4 of the pixel, its right neighbour and the two pixels
 * below them); reads 17 bytes per row and h+1 rows of pix2.
 * @param v         unused
 * @param line_size distance in bytes between rows
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below= pix2 + line_size;
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2508
/**
 * Sum of absolute differences between two 8-pixel-wide areas.
 * @param v         unused
 * @param line_size distance in bytes between rows (same for both inputs)
 * @param h         number of rows
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2528
/**
 * SAD of an 8-pixel-wide area against the reference averaged with its
 * right-hand neighbour (avg2 of pix2[x] and pix2[x+1]); reads 9 bytes per
 * row of pix2.
 * @param v         unused
 * @param line_size distance in bytes between rows
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2548
/**
 * SAD of an 8-pixel-wide area against the reference averaged with the row
 * below it (avg2 of pix2[x] and pix2[x+line_size]); reads h+1 rows of pix2.
 * @param v         unused
 * @param line_size distance in bytes between rows
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below= pix2 + line_size;
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2570
/**
 * SAD of an 8-pixel-wide area against the reference averaged over each 2x2
 * neighbourhood (avg4 of the pixel, its right neighbour and the two pixels
 * below them); reads 9 bytes per row and h+1 rows of pix2.
 * @param v         unused
 * @param line_size distance in bytes between rows
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below= pix2 + line_size;
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2592
364a1797
MN
2593static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2594 int i;
2595 unsigned int sum=0;
2596
2597 for(i=0; i<8*8; i++){
2598 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2599 int w= weight[i];
2600 b>>= RECON_SHIFT;
2601 assert(-512<b && b<512);
2602
2603 sum += (w*b)*(w*b)>>4;
2604 }
2605 return sum>>2;
2606}
2607
2608static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2609 int i;
2610
2611 for(i=0; i<8*8; i++){
2612 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2613 }
2614}
2615
a9badb51
MN
2616/**
2617 * permutes an 8x8 block.
2a5700de 2618 * @param block the block which will be permuted according to the given permutation vector
a9badb51
MN
2619 * @param permutation the permutation vector
2620 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2a5700de
MN
2621 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2622 * (inverse) permutated to scantable order!
a9badb51 2623 */
0c1a9eda 2624void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
d962f6fd 2625{
7801d21d 2626 int i;
477ab036 2627 DCTELEM temp[64];
7801d21d
MN
2628
2629 if(last<=0) return;
9a7b310d 2630 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
d962f6fd 2631
7801d21d
MN
2632 for(i=0; i<=last; i++){
2633 const int j= scantable[i];
2634 temp[j]= block[j];
2635 block[j]=0;
2636 }
2637
2638 for(i=0; i<=last; i++){
2639 const int j= scantable[i];
2640 const int perm_j= permutation[j];
2641 block[perm_j]= temp[j];
2642 }
d962f6fd 2643}
e0eac44e 2644
622348f9
MN
/**
 * Comparison function that ignores all of its arguments and always
 * reports a score of 0 (selected by ff_set_cmp() for FF_CMP_ZERO).
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
2648
2649void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
2650 int i;
2651
2652 memset(cmp, 0, sizeof(void*)*5);
2653
2654 for(i=0; i<5; i++){
2655 switch(type&0xFF){
2656 case FF_CMP_SAD:
2657 cmp[i]= c->sad[i];
2658 break;
2659 case FF_CMP_SATD:
2660 cmp[i]= c->hadamard8_diff[i];
2661 break;
2662 case FF_CMP_SSE:
2663 cmp[i]= c->sse[i];
2664 break;
2665 case FF_CMP_DCT:
2666 cmp[i]= c->dct_sad[i];
2667 break;
2668 case FF_CMP_PSNR:
2669 cmp[i]= c->quant_psnr[i];
2670 break;
2671 case FF_CMP_BIT:
2672 cmp[i]= c->bit[i];
2673 break;
2674 case FF_CMP_RD:
2675 cmp[i]= c->rd[i];
2676 break;
2677 case FF_CMP_VSAD:
2678 cmp[i]= c->vsad[i];
2679 break;
2680 case FF_CMP_VSSE:
2681 cmp[i]= c->vsse[i];
2682 break;
2683 case FF_CMP_ZERO:
2684 cmp[i]= zero_cmp;
2685 break;
2686 default:
2687 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
2688 }
2689 }
2690}
2691
2a5700de
MN
2692/**
2693 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2694 */
eb4b3dd3 2695static void clear_blocks_c(DCTELEM *blocks)
649c00c9
MN
2696{
2697 memset(blocks, 0, sizeof(DCTELEM)*6*64);
2698}
2699
11f18faf
MN
/**
 * dst[i] += src[i] for the first w bytes (modulo 256).
 * Nothing is done when w <= 0.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int n;

    for(n=0; n<w; n++)
        dst[n] += src[n];
}
2715
/**
 * dst[i] = src1[i] - src2[i] for the first w bytes (modulo 256).
 * Nothing is done when w <= 0.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int n;

    for(n=0; n<w; n++)
        dst[n]= src1[n] - src2[n];
}
2731
84705403
MN
/**
 * Writes to dst[] the difference between src2[] and a median prediction.
 * For every position the prediction is mid_pred() of the previous src2
 * sample (left), the src1 sample at the same position (top) and
 * (left + top - topleft)&0xFF.
 * @param left     in: sample left of the first position, out: last src2 sample
 * @param left_top in: src1 sample left of the first position, out: last src1 sample
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left    = *left;
    uint8_t cur_left_top= *left_top;
    int n;

    for(n=0; n<w; n++){
        const uint8_t top= src1[n];
        const int pred= mid_pred(cur_left, top, (cur_left + top - cur_left_top)&0xFF);

        cur_left_top= top;
        cur_left    = src2[n];
        dst[n]      = cur_left - pred;
    }

    *left    = cur_left;
    *left_top= cur_left_top;
}
2749
1457ab52
MN
/**
 * o1 = i1 + i2, o2 = i1 - i2.
 * Wrapped in do{}while(0) so that the macro behaves like one statement
 * (e.g. as the body of an unbraced if/else); use it with a trailing ';'.
 * i1 and i2 are evaluated twice.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
do{\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
}while(0)

/**
 * In-place butterfly: (x, y) becomes (x+y, x-y).
 * Use it with a trailing ';'.
 */
#define BUTTERFLY1(x,y) \
do{\
    const int bfly_a= (x);\
    const int bfly_b= (y);\
    (x)= bfly_a+bfly_b;\
    (y)= bfly_a-bfly_b;\
}while(0)

/** |x+y| + |x-y|, without modifying x or y. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2764
/**
 * Sum of the absolute values of the 8x8 butterfly (Hadamard-style)
 * transform of the difference src - dst.
 * @param s      unused
 * @param stride distance in bytes between rows of both inputs
 * @param h      must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int coef[64];
    int total= 0;
    int k;

    assert(h==8);

    /* horizontal pass: three butterfly stages on each row of the difference */
    for(k=0; k<8; k++){
        int * const row= coef + 8*k;
        const uint8_t * const a= src + stride*k;
        const uint8_t * const b= dst + stride*k;

        BUTTERFLY2(row[0], row[1], a[0]-b[0], a[1]-b[1]);
        BUTTERFLY2(row[2], row[3], a[2]-b[2], a[3]-b[3]);
        BUTTERFLY2(row[4], row[5], a[4]-b[4], a[5]-b[5]);
        BUTTERFLY2(row[6], row[7], a[6]-b[6], a[7]-b[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass: two in-place stages, the third one is folded into the
       absolute sum by BUTTERFLYA */
    for(k=0; k<8; k++){
        int * const col= coef + k;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
                +BUTTERFLYA(col[8*1], col[8*5])
                +BUTTERFLYA(col[8*2], col[8*6])
                +BUTTERFLYA(col[8*3], col[8*7]);
    }

    return total;
}
2816
/**
 * Sum of the absolute values of the 8x8 butterfly (Hadamard-style)
 * transform of src itself, with the |coef[0] + coef[32]| term (the mean)
 * subtracted again at the end.
 * @param s      unused
 * @param dummy  unused
 * @param stride distance in bytes between rows of src
 * @param h      must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int coef[64];
    int total= 0;
    int k;

    assert(h==8);

    /* horizontal pass: three butterfly stages on each row */
    for(k=0; k<8; k++){
        int * const row= coef + 8*k;
        const uint8_t * const a= src + stride*k;

        BUTTERFLY2(row[0], row[1], a[0], a[1]);
        BUTTERFLY2(row[2], row[3], a[2], a[3]);
        BUTTERFLY2(row[4], row[5], a[4], a[5]);
        BUTTERFLY2(row[6], row[7], a[6], a[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass: two in-place stages, the third one is folded into the
       absolute sum by BUTTERFLYA */
    for(k=0; k<8; k++){
        int * const col= coef + k;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        total += BUTTERFLYA(col[8*0], col[8*4])
                +BUTTERFLYA(col[8*1], col[8*5])
                +BUTTERFLYA(col[8*2], col[8*6])
                +BUTTERFLYA(col[8*3], col[8*7]);
    }

    total -= ABS(coef[8*0] + coef[8*4]); // -mean

    return total;
}
2864
bb198e19 2865static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52 2866 MpegEncContext * const s= (MpegEncContext *)c;
76fbb024
MN
2867 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2868 DCTELEM * const temp= (DCTELEM*)aligned_temp;
1457ab52 2869 int sum=0, i;
bb198e19
MN
2870
2871 assert(h==8);
1457ab52
MN
2872
2873 s->dsp.diff_pixels(temp, src1, src2, stride);
b0368839 2874 s->dsp.fdct(temp);
1457ab52
MN
2875
2876 for(i=0; i<64; i++)
2877 sum+= ABS(temp[i]);
2878
2879 return sum;
2880}
2881
0e15384d 2882void simple_idct(DCTELEM *block); //FIXME
1457ab52 2883
bb198e19 2884static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52 2885 MpegEncContext * const s= (MpegEncContext *)c;
76fbb024
MN
2886 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
2887 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2888 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
1457ab52
MN
2889 int sum=0, i;
2890
bb198e19 2891 assert(h==8);
1457ab52
MN
2892 s->mb_intra=0;
2893
2894 s->dsp.diff_pixels(temp, src1, src2, stride);
2895
2896 memcpy(bak, temp, 64*sizeof(DCTELEM));
2897
67725183 2898 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
d50635cd 2899 s->dct_unquantize_inter(s, temp, 0, s->qscale);
1457ab52
MN
2900 simple_idct(temp); //FIXME
2901
2902 for(i=0; i<64; i++)
2903 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2904
2905 return sum;
2906}
2907
bb198e19 2908static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3a87ac94 2909 MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda 2910 const uint8_t *scantable= s->intra_scantable.permutated;
76fbb024
MN
2911 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2912 uint64_t __align8 aligned_bak[stride];
2913 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2914 uint8_t * const bak= (uint8_t*)aligned_bak;
3a87ac94
MN
2915 int i, last, run, bits, level, distoration, start_i;
2916 const int esc_length= s->ac_esc_length;
2917 uint8_t * length;
2918 uint8_t * last_length;
67725183 2919
bb198e19
MN
2920 assert(h==8);
2921
67725183
MN
2922 for(i=0; i<8; i++){
2923 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
2924 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
2925 }
3a87ac94 2926
67725183
MN
2927 s->dsp.diff_pixels(temp, src1, src2, stride);
2928
2929 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2930
2931 bits=0;
3a87ac94
MN
2932
2933 if (s->mb_intra) {
67725183 2934 start_i = 1;
3a87ac94
MN
2935 length = s->intra_ac_vlc_length;
2936 last_length= s->intra_ac_vlc_last_length;
67725183 2937 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3a87ac94
MN
2938 } else {
2939 start_i = 0;
2940 length = s->inter_ac_vlc_length;
2941 last_length= s->inter_ac_vlc_last_length;
2942 }
3a87ac94 2943
67725183 2944 if(last>=start_i){
3a87ac94
MN
2945 run=0;
2946 for(i=start_i; i<last; i++){
2947 int j= scantable[i];
2948 level= temp[j];
2949
2950 if(level){
2951 level+=64;
2952 if((level&(~127)) == 0){
2953 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2954 }else
2955 bits+= esc_length;
2956 run=0;
2957 }else
2958 run++;
2959 }
2960 i= scantable[last];
1d0eab1d 2961
3a87ac94 2962 level= temp[i] + 64;
1d0eab1d
MN
2963
2964 assert(level - 64);
2965
3a87ac94
MN
2966 if((level&(~127)) == 0){
2967 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2968 }else
2969 bits+= esc_length;
2970
67725183
MN
2971 }
2972
2973 if(last>=0){
d50635cd
MN
2974 if(s->mb_intra)
2975 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2976 else
2977 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3a87ac94
MN
2978 }
2979
b0368839 2980 s->dsp.idct_add(bak, stride, temp);
3a87ac94 2981
bb198e19 2982 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3a87ac94 2983
67725183 2984 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3a87ac94
MN
2985}
2986
bb198e19 2987static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3a87ac94 2988 MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda 2989 const uint8_t *scantable= s->intra_scantable.permutated;
76fbb024
MN
2990 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2991 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3a87ac94
MN
2992 int i, last, run, bits, level, start_i;
2993 const int esc_length= s->ac_esc_length;
2994 uint8_t * length;
2995 uint8_t * last_length;
bb198e19
MN
2996
2997 assert(h==8);
67725183
MN
2998
2999 s->dsp.diff_pixels(temp, src1, src2, stride);
3a87ac94 3000
67725183
MN
3001 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3002
3003 bits=0;
3a87ac94
MN
3004
3005 if (s->mb_intra) {
67725183 3006 start_i = 1;
3a87ac94
MN
3007 length = s->intra_ac_vlc_length;
3008 last_length= s->intra_ac_vlc_last_length;
67725183 3009 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3a87ac94
MN
3010 } else {
3011 start_i = 0;
3012 length = s->inter_ac_vlc_length;
3013 last_length= s->inter_ac_vlc_last_length;
3014 }
3a87ac94 3015
67725183 3016 if(last>=start_i){
3a87ac94
MN
3017 run=0;
3018 for(i=start_i; i<last; i++){
3019 int j= scantable[i];
3020 level= temp[j];
3021
3022 if(level){
3023 level+=64;
3024 if((level&(~127)) == 0){
3025 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3026 }else
3027 bits+= esc_length;
3028 run=0;
3029 }else
3030 run++;
3031 }
3032 i= scantable[last];
67725183
MN
3033
3034 level= temp[i] + 64;
3a87ac94 3035
67725183 3036 assert(level - 64);
3a87ac94 3037
3a87ac94
MN
3038 if((level&(~127)) == 0){
3039 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3040 }else
3041 bits+= esc_length;
3042 }
3043
3044 return bits;
3045}
3046
622348f9
MN
/**
 * Sum of absolute differences between vertically adjacent pixels of a
 * 16-pixel-wide area with h rows (h-1 row pairs).
 * @param c     unused
 * @param dummy unused
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total= 0;
    int row, col;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++)
            total += ABS(s[col] - s[col+stride]);
        s += stride;
    }

    return total;
}
3061
/**
 * Like vsad_intra16_c() but applied to the difference s1 - s2: sums the
 * absolute vertical gradient of the difference over a 16-pixel-wide area
 * with h rows.
 * @param c unused
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total= 0;
    int row, col;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++)
            total += ABS(s1[col] - s2[col] - s1[col+stride] + s2[col+stride]);
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3076
/** Square of a; the argument is evaluated twice. */
#define SQ(a) ((a)*(a))

/**
 * Sum of squared differences between vertically adjacent pixels of a
 * 16-pixel-wide area with h rows (h-1 row pairs).
 * @param c     unused
 * @param dummy unused
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total= 0;
    int row, col;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++)
            total += SQ(s[col] - s[col+stride]);
        s += stride;
    }

    return total;
}
3092
/**
 * Like vsse_intra16_c() but applied to the difference s1 - s2: sums the
 * squared vertical gradient of the difference over a 16-pixel-wide area
 * with h rows.
 * @param c unused
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total= 0;
    int row, col;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++)
            total += SQ(s1[col] - s2[col] - s1[col+stride] + s2[col+stride]);
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3107
/* Instantiate the 16-pixel variants of the 8x8 comparison functions above.
 * The second argument is the name of the generated function; these names
 * are the ones installed by SET_CMP_FUNC() in dsputil_init().
 * NOTE(review): WARPER8_16_SQ is not defined in this part of the file
 * (presumably dsputil.h) - confirm how it combines the 8x8 calls. */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
1457ab52 3114
b0368839
MN
3115/* XXX: those functions should be suppressed ASAP when all IDCTs are
3116 converted */
3117static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3118{
3119 j_rev_dct (block);
3120 put_pixels_clamped_c(block, dest, line_size);
3121}
3122static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3123{
3124 j_rev_dct (block);
3125 add_pixels_clamped_c(block, dest, line_size);
3126}
3127
59cf08ce
FB
3128/* init static data */
3129void dsputil_static_init(void)
e0eac44e 3130{
d2975f8d 3131 int i;
e0eac44e 3132
59cf08ce
FB
3133 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3134 for(i=0;i<MAX_NEG_CROP;i++) {
3135 cropTbl[i] = 0;
3136 cropTbl[i + MAX_NEG_CROP + 256] = 255;
3137 }
3138
3139 for(i=0;i<512;i++) {
3140 squareTbl[i] = (i - 256) * (i - 256);
3141 }
3142
3143 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3144}
92ddb692 3145
92ddb692 3146
59cf08ce
FB
/**
 * Fills a DSPContext with the C implementations of all DSP functions,
 * then calls the architecture specific init functions that are compiled in,
 * and finally builds c->idct_permutation from c->idct_permutation_type.
 * @param c     the context to fill
 * @param avctx codec context; dct_algo and idct_algo are read here and
 *              avctx is passed on to the architecture specific inits
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    /* forward DCT, only needed when encoders are compiled in */
#ifdef CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT and the coefficient permutation it expects */
    if(avctx->idct_algo==FF_IDCT_INT){
        c->idct_put= ff_jref_idct_put;
        c->idct_add= ff_jref_idct_add;
        c->idct    = j_rev_dct;
        c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
    }else{ //accurate/default
        c->idct_put= simple_idct_put;
        c->idct_add= simple_idct_add;
        c->idct    = simple_idct;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }

    /* VP3 DSP support */
    c->vp3_dsp_init = vp3_dsp_init_c;
    c->vp3_idct = vp3_idct_c;

    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->gmc1 = gmc1_c;
    c->gmc = gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* TODO [0] 16 [1] 8 */
    /* second index: 0 = plain, 1 = x2, 2 = y2, 3 = xy2 variant */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* PFX_pixels_tab[IDX][0..3] = plain, _x2, _y2 and _xy2 function for block size NUM */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    /* tpel tables: index = x + 4*y with x,y in 0..2; entries 3 and 7 are not assigned here */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* PFX_pixels_tab[IDX][x + 4*y] = PFX NUM _mcXY_c for x,y in 0..3 */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;

    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

    /* comparison functions: slot 0 = 16 pixel wide, slot 1 = 8x8;
       slot 4 is only set for hadamard8_diff, vsad and vsse (intra 16 variants),
       the remaining slots are not assigned here */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;

    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;

    c->h263_h_loop_filter= h263_h_loop_filter_c;
    c->h263_v_loop_filter= h263_v_loop_filter_c;

    c->h261_h_loop_filter= h261_h_loop_filter_c;
    c->h261_v_loop_filter= h261_v_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

    /* architecture specific inits, called after all C defaults are in place.
       NOTE(review): presumably they replace selected pointers and may change
       idct_permutation_type - their bodies are not in this file part */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_SPARC
   dsputil_init_vis(c,avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif
#ifdef ARCH_SH4
    dsputil_init_sh4(c,avctx);
#endif

    /* build the permutation table for the permutation type in effect now */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        /* identity */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        /* row bits (0x38) are kept, the 3 column bits are rotated */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        /* row and column index are swapped */
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
b0368839 3380