10l
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
1/*
2 * DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
de6d9b64 4 *
ff4ec49e
FB
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
de6d9b64 9 *
ff4ec49e 10 * This library is distributed in the hope that it will be useful,
de6d9b64 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
de6d9b64 14 *
ff4ec49e
FB
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
7ff037e9 18 *
59fe111e 19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
de6d9b64 20 */
de6d9b64
FB
21#include "avcodec.h"
22#include "dsputil.h"
1457ab52 23#include "mpegvideo.h"
45553457 24
5596c60c
MN
25int ff_bit_exact=0;
26
0cfa9713 27UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
de6d9b64
FB
28UINT32 squareTbl[512];
29
2ad1516a
MN
30const UINT8 ff_zigzag_direct[64] = {
31 0, 1, 8, 16, 9, 2, 3, 10,
32 17, 24, 32, 25, 18, 11, 4, 5,
e0eac44e 33 12, 19, 26, 33, 40, 48, 41, 34,
2ad1516a 34 27, 20, 13, 6, 7, 14, 21, 28,
e0eac44e
FB
35 35, 42, 49, 56, 57, 50, 43, 36,
36 29, 22, 15, 23, 30, 37, 44, 51,
37 58, 59, 52, 45, 38, 31, 39, 46,
38 53, 60, 61, 54, 47, 55, 62, 63
39};
40
2f349de2
MN
/* non-permuted inverse of zigzag_direct, plus 1, for the MMX quantizer */
42UINT16 __align8 inv_zigzag_direct16[64];
43
2ad1516a
MN
44const UINT8 ff_alternate_horizontal_scan[64] = {
45 0, 1, 2, 3, 8, 9, 16, 17,
e0eac44e
FB
46 10, 11, 4, 5, 6, 7, 15, 14,
47 13, 12, 19, 18, 24, 25, 32, 33,
48 26, 27, 20, 21, 22, 23, 28, 29,
49 30, 31, 34, 35, 40, 41, 48, 49,
50 42, 43, 36, 37, 38, 39, 44, 45,
51 46, 47, 50, 51, 56, 57, 58, 59,
52 52, 53, 54, 55, 60, 61, 62, 63,
53};
54
2ad1516a
MN
55const UINT8 ff_alternate_vertical_scan[64] = {
56 0, 8, 16, 24, 1, 9, 2, 10,
e0eac44e
FB
57 17, 25, 32, 40, 48, 56, 57, 49,
58 41, 33, 26, 18, 3, 11, 4, 12,
59 19, 27, 34, 42, 50, 58, 35, 43,
60 51, 59, 20, 28, 5, 13, 6, 14,
61 21, 29, 36, 44, 52, 60, 37, 45,
62 53, 61, 22, 30, 7, 15, 23, 31,
63 38, 46, 54, 62, 39, 47, 55, 63,
64};
65
2f349de2 66/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
eb4b3dd3 67const UINT32 inverse[256]={
2f349de2
MN
68 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
69 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
70 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
71 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
72 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
73 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
74 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
75 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
76 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
77 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
78 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
79 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
80 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
81 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
82 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
83 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
84 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
85 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
86 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
87 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
88 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
89 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
90 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
91 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
92 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
93 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
94 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
95 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
96 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
97 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
98 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
99 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
100};
101
eb4b3dd3 102static int pix_sum_c(UINT8 * pix, int line_size)
3aa102be
MN
103{
104 int s, i, j;
105
106 s = 0;
107 for (i = 0; i < 16; i++) {
108 for (j = 0; j < 16; j += 8) {
109 s += pix[0];
110 s += pix[1];
111 s += pix[2];
112 s += pix[3];
113 s += pix[4];
114 s += pix[5];
115 s += pix[6];
116 s += pix[7];
117 pix += 8;
118 }
119 pix += line_size - 16;
120 }
121 return s;
122}
123
eb4b3dd3 124static int pix_norm1_c(UINT8 * pix, int line_size)
3aa102be
MN
125{
126 int s, i, j;
127 UINT32 *sq = squareTbl + 256;
128
129 s = 0;
130 for (i = 0; i < 16; i++) {
131 for (j = 0; j < 16; j += 8) {
2a006cd3 132#if 0
3aa102be
MN
133 s += sq[pix[0]];
134 s += sq[pix[1]];
135 s += sq[pix[2]];
136 s += sq[pix[3]];
137 s += sq[pix[4]];
138 s += sq[pix[5]];
139 s += sq[pix[6]];
140 s += sq[pix[7]];
2a006cd3
FL
141#else
142#if LONG_MAX > 2147483647
143 register uint64_t x=*(uint64_t*)pix;
144 s += sq[x&0xff];
145 s += sq[(x>>8)&0xff];
146 s += sq[(x>>16)&0xff];
147 s += sq[(x>>24)&0xff];
148 s += sq[(x>>32)&0xff];
149 s += sq[(x>>40)&0xff];
150 s += sq[(x>>48)&0xff];
151 s += sq[(x>>56)&0xff];
152#else
153 register uint32_t x=*(uint32_t*)pix;
154 s += sq[x&0xff];
155 s += sq[(x>>8)&0xff];
156 s += sq[(x>>16)&0xff];
157 s += sq[(x>>24)&0xff];
158 x=*(uint32_t*)(pix+4);
159 s += sq[x&0xff];
160 s += sq[(x>>8)&0xff];
161 s += sq[(x>>16)&0xff];
162 s += sq[(x>>24)&0xff];
163#endif
164#endif
3aa102be
MN
165 pix += 8;
166 }
167 pix += line_size - 16;
168 }
169 return s;
170}
171
172
1457ab52
MN
173static int sse8_c(void *v, UINT8 * pix1, UINT8 * pix2, int line_size)
174{
175 int s, i;
176 UINT32 *sq = squareTbl + 256;
177
178 s = 0;
179 for (i = 0; i < 8; i++) {
180 s += sq[pix1[0] - pix2[0]];
181 s += sq[pix1[1] - pix2[1]];
182 s += sq[pix1[2] - pix2[2]];
183 s += sq[pix1[3] - pix2[3]];
184 s += sq[pix1[4] - pix2[4]];
185 s += sq[pix1[5] - pix2[5]];
186 s += sq[pix1[6] - pix2[6]];
187 s += sq[pix1[7] - pix2[7]];
188 pix1 += line_size;
189 pix2 += line_size;
190 }
191 return s;
192}
193
194static int sse16_c(void *v, UINT8 * pix1, UINT8 * pix2, int line_size)
9c76bd48
BF
195{
196 int s, i, j;
197 UINT32 *sq = squareTbl + 256;
198
199 s = 0;
200 for (i = 0; i < 16; i++) {
201 for (j = 0; j < 16; j += 8) {
2a006cd3
FL
202#if 1
203#if LONG_MAX > 2147483647
204 uint64_t x,y;
205 x=*(uint64_t*)pix1;
206 y=*(uint64_t*)pix2;
207
208 s += sq[(x&0xff) - (y&0xff)];
209 s += sq[((x>>8)&0xff) - ((y>>8)&0xff)];
210 s += sq[((x>>16)&0xff) - ((y>>16)&0xff)];
211 s += sq[((x>>24)&0xff) - ((y>>24)&0xff)];
212 s += sq[((x>>32)&0xff) - ((y>>32)&0xff)];
213 s += sq[((x>>40)&0xff) - ((y>>40)&0xff)];
214 s += sq[((x>>48)&0xff) - ((y>>48)&0xff)];
215 s += sq[((x>>56)&0xff) - ((y>>56)&0xff)];
216#else
217 uint32_t x,y;
218 x=*(uint32_t*)pix1;
219 y=*(uint32_t*)pix2;
220
221 s += sq[(x&0xff) - (y&0xff)];
222 s += sq[((x>>8)&0xff) - ((y>>8)&0xff)];
223 s += sq[((x>>16)&0xff) - ((y>>16)&0xff)];
224 s += sq[((x>>24)&0xff) - ((y>>24)&0xff)];
225
226 x=*(uint32_t*)(pix1+4);
227 y=*(uint32_t*)(pix2+4);
228 s += sq[(x&0xff) - (y&0xff)];
229 s += sq[((x>>8)&0xff) - ((y>>8)&0xff)];
230 s += sq[((x>>16)&0xff) - ((y>>16)&0xff)];
231 s += sq[((x>>24)&0xff) - ((y>>24)&0xff)];
232#endif
233#else
9c76bd48
BF
234 s += sq[pix1[0] - pix2[0]];
235 s += sq[pix1[1] - pix2[1]];
236 s += sq[pix1[2] - pix2[2]];
237 s += sq[pix1[3] - pix2[3]];
238 s += sq[pix1[4] - pix2[4]];
239 s += sq[pix1[5] - pix2[5]];
240 s += sq[pix1[6] - pix2[6]];
241 s += sq[pix1[7] - pix2[7]];
2a006cd3 242#endif
9c76bd48
BF
243 pix1 += 8;
244 pix2 += 8;
245 }
246 pix1 += line_size - 16;
247 pix2 += line_size - 16;
248 }
249 return s;
250}
251
eb4b3dd3 252static void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
de6d9b64 253{
de6d9b64
FB
254 int i;
255
256 /* read the pixels */
de6d9b64 257 for(i=0;i<8;i++) {
c13e1abd
FH
258 block[0] = pixels[0];
259 block[1] = pixels[1];
260 block[2] = pixels[2];
261 block[3] = pixels[3];
262 block[4] = pixels[4];
263 block[5] = pixels[5];
264 block[6] = pixels[6];
265 block[7] = pixels[7];
266 pixels += line_size;
267 block += 8;
de6d9b64
FB
268 }
269}
270
eb4b3dd3
ZK
271static void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1,
272 const UINT8 *s2, int stride){
9dbcbd92
MN
273 int i;
274
275 /* read the pixels */
9dbcbd92 276 for(i=0;i<8;i++) {
c13e1abd
FH
277 block[0] = s1[0] - s2[0];
278 block[1] = s1[1] - s2[1];
279 block[2] = s1[2] - s2[2];
280 block[3] = s1[3] - s2[3];
281 block[4] = s1[4] - s2[4];
282 block[5] = s1[5] - s2[5];
283 block[6] = s1[6] - s2[6];
284 block[7] = s1[7] - s2[7];
9dbcbd92
MN
285 s1 += stride;
286 s2 += stride;
c13e1abd 287 block += 8;
9dbcbd92
MN
288 }
289}
290
291
eb4b3dd3
ZK
292static void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
293 int line_size)
de6d9b64 294{
de6d9b64
FB
295 int i;
296 UINT8 *cm = cropTbl + MAX_NEG_CROP;
297
298 /* read the pixels */
de6d9b64 299 for(i=0;i<8;i++) {
c13e1abd
FH
300 pixels[0] = cm[block[0]];
301 pixels[1] = cm[block[1]];
302 pixels[2] = cm[block[2]];
303 pixels[3] = cm[block[3]];
304 pixels[4] = cm[block[4]];
305 pixels[5] = cm[block[5]];
306 pixels[6] = cm[block[6]];
307 pixels[7] = cm[block[7]];
308
309 pixels += line_size;
310 block += 8;
de6d9b64
FB
311 }
312}
313
eb4b3dd3 314static void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
c13e1abd 315 int line_size)
de6d9b64 316{
de6d9b64
FB
317 int i;
318 UINT8 *cm = cropTbl + MAX_NEG_CROP;
319
320 /* read the pixels */
de6d9b64 321 for(i=0;i<8;i++) {
c13e1abd
FH
322 pixels[0] = cm[pixels[0] + block[0]];
323 pixels[1] = cm[pixels[1] + block[1]];
324 pixels[2] = cm[pixels[2] + block[2]];
325 pixels[3] = cm[pixels[3] + block[3]];
326 pixels[4] = cm[pixels[4] + block[4]];
327 pixels[5] = cm[pixels[5] + block[5]];
328 pixels[6] = cm[pixels[6] + block[6]];
329 pixels[7] = cm[pixels[7] + block[7]];
330 pixels += line_size;
331 block += 8;
de6d9b64
FB
332 }
333}
59fe111e
MN
334#if 0
335
336#define PIXOP2(OPNAME, OP) \
b3184779 337static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
338{\
339 int i;\
340 for(i=0; i<h; i++){\
341 OP(*((uint64_t*)block), LD64(pixels));\
342 pixels+=line_size;\
343 block +=line_size;\
344 }\
345}\
346\
45553457 347static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
348{\
349 int i;\
350 for(i=0; i<h; i++){\
351 const uint64_t a= LD64(pixels );\
352 const uint64_t b= LD64(pixels+1);\
353 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
354 pixels+=line_size;\
355 block +=line_size;\
356 }\
357}\
358\
45553457 359static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
360{\
361 int i;\
362 for(i=0; i<h; i++){\
363 const uint64_t a= LD64(pixels );\
364 const uint64_t b= LD64(pixels+1);\
365 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
366 pixels+=line_size;\
367 block +=line_size;\
368 }\
369}\
370\
45553457 371static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
372{\
373 int i;\
374 for(i=0; i<h; i++){\
375 const uint64_t a= LD64(pixels );\
376 const uint64_t b= LD64(pixels+line_size);\
377 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
378 pixels+=line_size;\
379 block +=line_size;\
380 }\
381}\
382\
45553457 383static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
384{\
385 int i;\
386 for(i=0; i<h; i++){\
387 const uint64_t a= LD64(pixels );\
388 const uint64_t b= LD64(pixels+line_size);\
389 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
390 pixels+=line_size;\
391 block +=line_size;\
392 }\
393}\
394\
45553457 395static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
396{\
397 int i;\
398 const uint64_t a= LD64(pixels );\
399 const uint64_t b= LD64(pixels+1);\
400 uint64_t l0= (a&0x0303030303030303ULL)\
401 + (b&0x0303030303030303ULL)\
402 + 0x0202020202020202ULL;\
403 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
404 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
405 uint64_t l1,h1;\
406\
407 pixels+=line_size;\
408 for(i=0; i<h; i+=2){\
409 uint64_t a= LD64(pixels );\
410 uint64_t b= LD64(pixels+1);\
411 l1= (a&0x0303030303030303ULL)\
412 + (b&0x0303030303030303ULL);\
413 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
414 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
415 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
416 pixels+=line_size;\
417 block +=line_size;\
418 a= LD64(pixels );\
419 b= LD64(pixels+1);\
420 l0= (a&0x0303030303030303ULL)\
421 + (b&0x0303030303030303ULL)\
422 + 0x0202020202020202ULL;\
423 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
424 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
425 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
426 pixels+=line_size;\
427 block +=line_size;\
428 }\
429}\
430\
45553457 431static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
432{\
433 int i;\
434 const uint64_t a= LD64(pixels );\
435 const uint64_t b= LD64(pixels+1);\
436 uint64_t l0= (a&0x0303030303030303ULL)\
437 + (b&0x0303030303030303ULL)\
438 + 0x0101010101010101ULL;\
439 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
440 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
441 uint64_t l1,h1;\
442\
443 pixels+=line_size;\
444 for(i=0; i<h; i+=2){\
445 uint64_t a= LD64(pixels );\
446 uint64_t b= LD64(pixels+1);\
447 l1= (a&0x0303030303030303ULL)\
448 + (b&0x0303030303030303ULL);\
449 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
450 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
451 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
452 pixels+=line_size;\
453 block +=line_size;\
454 a= LD64(pixels );\
455 b= LD64(pixels+1);\
456 l0= (a&0x0303030303030303ULL)\
457 + (b&0x0303030303030303ULL)\
458 + 0x0101010101010101ULL;\
459 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
460 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
461 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
462 pixels+=line_size;\
463 block +=line_size;\
464 }\
465}\
466\
45553457
ZK
467CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
468CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
469CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
470CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
471CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
472CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
473CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
59fe111e
MN
474
475#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
476#else // 64 bit variant
477
478#define PIXOP2(OPNAME, OP) \
45553457 479static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
59fe111e
MN
480 int i;\
481 for(i=0; i<h; i++){\
482 OP(*((uint32_t*)(block )), LD32(pixels ));\
483 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
484 pixels+=line_size;\
485 block +=line_size;\
486 }\
487}\
45553457
ZK
488static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
489 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
b3184779 490}\
59fe111e 491\
b3184779
MN
492static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
493 int src_stride1, int src_stride2, int h){\
59fe111e
MN
494 int i;\
495 for(i=0; i<h; i++){\
b3184779
MN
496 uint32_t a,b;\
497 a= LD32(&src1[i*src_stride1 ]);\
498 b= LD32(&src2[i*src_stride2 ]);\
499 OP(*((uint32_t*)&dst[i*dst_stride ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
500 a= LD32(&src1[i*src_stride1+4]);\
501 b= LD32(&src2[i*src_stride2+4]);\
502 OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
59fe111e
MN
503 }\
504}\
505\
b3184779
MN
506static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
507 int src_stride1, int src_stride2, int h){\
59fe111e
MN
508 int i;\
509 for(i=0; i<h; i++){\
b3184779
MN
510 uint32_t a,b;\
511 a= LD32(&src1[i*src_stride1 ]);\
512 b= LD32(&src2[i*src_stride2 ]);\
513 OP(*((uint32_t*)&dst[i*dst_stride ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
514 a= LD32(&src1[i*src_stride1+4]);\
515 b= LD32(&src2[i*src_stride2+4]);\
516 OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
59fe111e
MN
517 }\
518}\
519\
b3184779
MN
520static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
521 int src_stride1, int src_stride2, int h){\
522 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
523 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
524}\
525\
526static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
527 int src_stride1, int src_stride2, int h){\
528 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
529 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
530}\
531\
45553457 532static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
533 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
534}\
535\
45553457 536static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
537 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
538}\
539\
45553457 540static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
541 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
542}\
543\
45553457 544static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
545 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
546}\
547\
548static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
549 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
550 int i;\
551 for(i=0; i<h; i++){\
b3184779
MN
552 uint32_t a, b, c, d, l0, l1, h0, h1;\
553 a= LD32(&src1[i*src_stride1]);\
554 b= LD32(&src2[i*src_stride2]);\
555 c= LD32(&src3[i*src_stride3]);\
556 d= LD32(&src4[i*src_stride4]);\
557 l0= (a&0x03030303UL)\
558 + (b&0x03030303UL)\
559 + 0x02020202UL;\
560 h0= ((a&0xFCFCFCFCUL)>>2)\
561 + ((b&0xFCFCFCFCUL)>>2);\
562 l1= (c&0x03030303UL)\
563 + (d&0x03030303UL);\
564 h1= ((c&0xFCFCFCFCUL)>>2)\
565 + ((d&0xFCFCFCFCUL)>>2);\
566 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
567 a= LD32(&src1[i*src_stride1+4]);\
568 b= LD32(&src2[i*src_stride2+4]);\
569 c= LD32(&src3[i*src_stride3+4]);\
570 d= LD32(&src4[i*src_stride4+4]);\
571 l0= (a&0x03030303UL)\
572 + (b&0x03030303UL)\
573 + 0x02020202UL;\
574 h0= ((a&0xFCFCFCFCUL)>>2)\
575 + ((b&0xFCFCFCFCUL)>>2);\
576 l1= (c&0x03030303UL)\
577 + (d&0x03030303UL);\
578 h1= ((c&0xFCFCFCFCUL)>>2)\
579 + ((d&0xFCFCFCFCUL)>>2);\
580 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
581 }\
582}\
b3184779
MN
583static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
584 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
585 int i;\
586 for(i=0; i<h; i++){\
b3184779
MN
587 uint32_t a, b, c, d, l0, l1, h0, h1;\
588 a= LD32(&src1[i*src_stride1]);\
589 b= LD32(&src2[i*src_stride2]);\
590 c= LD32(&src3[i*src_stride3]);\
591 d= LD32(&src4[i*src_stride4]);\
592 l0= (a&0x03030303UL)\
593 + (b&0x03030303UL)\
594 + 0x01010101UL;\
595 h0= ((a&0xFCFCFCFCUL)>>2)\
596 + ((b&0xFCFCFCFCUL)>>2);\
597 l1= (c&0x03030303UL)\
598 + (d&0x03030303UL);\
599 h1= ((c&0xFCFCFCFCUL)>>2)\
600 + ((d&0xFCFCFCFCUL)>>2);\
601 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
602 a= LD32(&src1[i*src_stride1+4]);\
603 b= LD32(&src2[i*src_stride2+4]);\
604 c= LD32(&src3[i*src_stride3+4]);\
605 d= LD32(&src4[i*src_stride4+4]);\
606 l0= (a&0x03030303UL)\
607 + (b&0x03030303UL)\
608 + 0x01010101UL;\
609 h0= ((a&0xFCFCFCFCUL)>>2)\
610 + ((b&0xFCFCFCFCUL)>>2);\
611 l1= (c&0x03030303UL)\
612 + (d&0x03030303UL);\
613 h1= ((c&0xFCFCFCFCUL)>>2)\
614 + ((d&0xFCFCFCFCUL)>>2);\
615 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
616 }\
617}\
b3184779
MN
618static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
619 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
620 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
621 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
622}\
623static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
624 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
625 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
626 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
627}\
59fe111e 628\
45553457 629static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
630{\
631 int j;\
632 for(j=0; j<2; j++){\
633 int i;\
634 const uint32_t a= LD32(pixels );\
635 const uint32_t b= LD32(pixels+1);\
636 uint32_t l0= (a&0x03030303UL)\
637 + (b&0x03030303UL)\
638 + 0x02020202UL;\
639 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
640 + ((b&0xFCFCFCFCUL)>>2);\
641 uint32_t l1,h1;\
642\
643 pixels+=line_size;\
644 for(i=0; i<h; i+=2){\
645 uint32_t a= LD32(pixels );\
646 uint32_t b= LD32(pixels+1);\
647 l1= (a&0x03030303UL)\
648 + (b&0x03030303UL);\
649 h1= ((a&0xFCFCFCFCUL)>>2)\
650 + ((b&0xFCFCFCFCUL)>>2);\
651 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
652 pixels+=line_size;\
653 block +=line_size;\
654 a= LD32(pixels );\
655 b= LD32(pixels+1);\
656 l0= (a&0x03030303UL)\
657 + (b&0x03030303UL)\
658 + 0x02020202UL;\
659 h0= ((a&0xFCFCFCFCUL)>>2)\
660 + ((b&0xFCFCFCFCUL)>>2);\
661 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
662 pixels+=line_size;\
663 block +=line_size;\
664 }\
665 pixels+=4-line_size*(h+1);\
666 block +=4-line_size*h;\
667 }\
668}\
669\
45553457 670static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
671{\
672 int j;\
673 for(j=0; j<2; j++){\
674 int i;\
675 const uint32_t a= LD32(pixels );\
676 const uint32_t b= LD32(pixels+1);\
677 uint32_t l0= (a&0x03030303UL)\
678 + (b&0x03030303UL)\
679 + 0x01010101UL;\
680 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
681 + ((b&0xFCFCFCFCUL)>>2);\
682 uint32_t l1,h1;\
683\
684 pixels+=line_size;\
685 for(i=0; i<h; i+=2){\
686 uint32_t a= LD32(pixels );\
687 uint32_t b= LD32(pixels+1);\
688 l1= (a&0x03030303UL)\
689 + (b&0x03030303UL);\
690 h1= ((a&0xFCFCFCFCUL)>>2)\
691 + ((b&0xFCFCFCFCUL)>>2);\
692 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
693 pixels+=line_size;\
694 block +=line_size;\
695 a= LD32(pixels );\
696 b= LD32(pixels+1);\
697 l0= (a&0x03030303UL)\
698 + (b&0x03030303UL)\
699 + 0x01010101UL;\
700 h0= ((a&0xFCFCFCFCUL)>>2)\
701 + ((b&0xFCFCFCFCUL)>>2);\
702 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
703 pixels+=line_size;\
704 block +=line_size;\
705 }\
706 pixels+=4-line_size*(h+1);\
707 block +=4-line_size*h;\
708 }\
709}\
710\
45553457
ZK
711CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
712CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
713CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
714CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
715CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
716CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
717CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
718CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
b3184779 719
59fe111e
MN
720#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
721#endif
59fe111e
MN
722#define op_put(a, b) a = b
723
724PIXOP2(avg, op_avg)
725PIXOP2(put, op_put)
726#undef op_avg
727#undef op_put
728
de6d9b64
FB
/*
 * Rounded-up integer averages of two and of four values.
 * Every argument is parenthesized so that operands containing operators of
 * lower precedence than '+' (|, &, ?:, ...) are averaged as a whole.
 * Arguments are evaluated exactly once each.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
731
073b013d 732
b3184779 733static void gmc1_c(UINT8 *dst, UINT8 *src, int stride, int h, int x16, int y16, int rounder)
44eb4951
MN
734{
735 const int A=(16-x16)*(16-y16);
736 const int B=( x16)*(16-y16);
737 const int C=(16-x16)*( y16);
738 const int D=( x16)*( y16);
739 int i;
44eb4951
MN
740
741 for(i=0; i<h; i++)
742 {
b3184779
MN
743 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
744 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
745 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
746 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
747 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
748 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
749 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
750 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
751 dst+= stride;
752 src+= stride;
44eb4951
MN
753 }
754}
755
073b013d
MN
756static void gmc_c(UINT8 *dst, UINT8 *src, int stride, int h, int ox, int oy,
757 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
758{
759 int y, vx, vy;
760 const int s= 1<<shift;
761
762 width--;
763 height--;
764
765 for(y=0; y<h; y++){
766 int x;
767
768 vx= ox;
769 vy= oy;
770 for(x=0; x<8; x++){ //XXX FIXME optimize
771 int src_x, src_y, frac_x, frac_y, index;
772
773 src_x= vx>>16;
774 src_y= vy>>16;
775 frac_x= src_x&(s-1);
776 frac_y= src_y&(s-1);
777 src_x>>=shift;
778 src_y>>=shift;
779
780 if((unsigned)src_x < width){
781 if((unsigned)src_y < height){
782 index= src_x + src_y*stride;
783 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
784 + src[index +1]* frac_x )*(s-frac_y)
785 + ( src[index+stride ]*(s-frac_x)
786 + src[index+stride+1]* frac_x )* frac_y
787 + r)>>(shift*2);
788 }else{
789 index= src_x + clip(src_y, 0, height)*stride;
790 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
791 + src[index +1]* frac_x )*s
792 + r)>>(shift*2);
793 }
794 }else{
795 if((unsigned)src_y < height){
796 index= clip(src_x, 0, width) + src_y*stride;
797 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
798 + src[index+stride ]* frac_y )*s
799 + r)>>(shift*2);
800 }else{
801 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
802 dst[y*stride + x]= src[index ];
803 }
804 }
805
806 vx+= dxx;
807 vy+= dyx;
808 }
809 ox += dxy;
810 oy += dyy;
811 }
812}
813
b3184779 814static inline void copy_block17(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
44eb4951 815{
44eb4951
MN
816 int i;
817 for(i=0; i<h; i++)
818 {
b3184779
MN
819 ST32(dst , LD32(src ));
820 ST32(dst+4 , LD32(src+4 ));
821 ST32(dst+8 , LD32(src+8 ));
822 ST32(dst+12, LD32(src+12));
823 dst[16]= src[16];
44eb4951
MN
824 dst+=dstStride;
825 src+=srcStride;
826 }
827}
828
b3184779 829static inline void copy_block9(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h)
44eb4951
MN
830{
831 int i;
b3184779 832 for(i=0; i<h; i++)
44eb4951 833 {
b3184779
MN
834 ST32(dst , LD32(src ));
835 ST32(dst+4 , LD32(src+4 ));
836 dst[8]= src[8];
44eb4951
MN
837 dst+=dstStride;
838 src+=srcStride;
839 }
840}
841
826f429a 842
b3184779
MN
843#define QPEL_MC(r, OPNAME, RND, OP) \
844static void OPNAME ## mpeg4_qpel8_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
845 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
846 int i;\
847 for(i=0; i<h; i++)\
848 {\
849 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
850 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
851 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
852 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
853 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
854 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
855 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
856 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
857 dst+=dstStride;\
858 src+=srcStride;\
859 }\
44eb4951
MN
860}\
861\
db794953
MN
862static void OPNAME ## mpeg4_qpel8_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride){\
863 const int w=8;\
b3184779
MN
864 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
865 int i;\
866 for(i=0; i<w; i++)\
867 {\
868 const int src0= src[0*srcStride];\
869 const int src1= src[1*srcStride];\
870 const int src2= src[2*srcStride];\
871 const int src3= src[3*srcStride];\
872 const int src4= src[4*srcStride];\
873 const int src5= src[5*srcStride];\
874 const int src6= src[6*srcStride];\
875 const int src7= src[7*srcStride];\
876 const int src8= src[8*srcStride];\
877 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
878 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
879 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
880 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
881 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
882 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
883 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
884 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
885 dst++;\
886 src++;\
887 }\
888}\
889\
890static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
891 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
892 int i;\
826f429a 893 \
b3184779
MN
894 for(i=0; i<h; i++)\
895 {\
896 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
897 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
898 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
899 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
900 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
901 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
902 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
903 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
904 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
905 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
906 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
907 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
908 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
909 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
910 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
911 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
912 dst+=dstStride;\
913 src+=srcStride;\
914 }\
915}\
916\
826f429a 917static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride){\
b3184779
MN
918 UINT8 *cm = cropTbl + MAX_NEG_CROP;\
919 int i;\
826f429a 920 const int w=16;\
b3184779
MN
921 for(i=0; i<w; i++)\
922 {\
923 const int src0= src[0*srcStride];\
924 const int src1= src[1*srcStride];\
925 const int src2= src[2*srcStride];\
926 const int src3= src[3*srcStride];\
927 const int src4= src[4*srcStride];\
928 const int src5= src[5*srcStride];\
929 const int src6= src[6*srcStride];\
930 const int src7= src[7*srcStride];\
931 const int src8= src[8*srcStride];\
932 const int src9= src[9*srcStride];\
933 const int src10= src[10*srcStride];\
934 const int src11= src[11*srcStride];\
935 const int src12= src[12*srcStride];\
936 const int src13= src[13*srcStride];\
937 const int src14= src[14*srcStride];\
938 const int src15= src[15*srcStride];\
939 const int src16= src[16*srcStride];\
940 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
941 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
942 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
943 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
944 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
945 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
946 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
947 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
948 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
949 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
950 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
951 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
952 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
953 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
954 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
955 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
956 dst++;\
957 src++;\
958 }\
959}\
960\
961static void OPNAME ## qpel8_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
45553457 962 OPNAME ## pixels8_c(dst, src, stride, 8);\
b3184779
MN
963}\
964\
965static void OPNAME ## qpel8_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951 966 UINT8 half[64];\
b3184779
MN
967 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
968 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
44eb4951
MN
969}\
970\
b3184779
MN
971static void OPNAME ## qpel8_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
972 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
44eb4951
MN
973}\
974\
b3184779 975static void OPNAME ## qpel8_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951 976 UINT8 half[64];\
b3184779
MN
977 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
978 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
44eb4951
MN
979}\
980\
b3184779
MN
981static void OPNAME ## qpel8_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
982 UINT8 full[16*9];\
44eb4951 983 UINT8 half[64];\
b3184779 984 copy_block9(full, src, 16, stride, 9);\
db794953 985 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 986 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
44eb4951
MN
987}\
988\
b3184779
MN
989static void OPNAME ## qpel8_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
990 UINT8 full[16*9];\
991 copy_block9(full, src, 16, stride, 9);\
db794953 992 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
44eb4951
MN
993}\
994\
b3184779
MN
995static void OPNAME ## qpel8_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
996 UINT8 full[16*9];\
44eb4951 997 UINT8 half[64];\
b3184779 998 copy_block9(full, src, 16, stride, 9);\
db794953 999 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 1000 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
44eb4951 1001}\
db794953 1002void ff_ ## OPNAME ## qpel8_mc11_old_c(UINT8 *dst, UINT8 *src, int stride){\
b3184779 1003 UINT8 full[16*9];\
44eb4951 1004 UINT8 halfH[72];\
7ff037e9 1005 UINT8 halfV[64];\
44eb4951 1006 UINT8 halfHV[64];\
b3184779
MN
1007 copy_block9(full, src, 16, stride, 9);\
1008 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1009 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1010 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1011 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1012}\
db794953
MN
1013static void OPNAME ## qpel8_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
1014 UINT8 full[16*9];\
1015 UINT8 halfH[72];\
1016 UINT8 halfHV[64];\
1017 copy_block9(full, src, 16, stride, 9);\
1018 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1019 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1020 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1021 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1022}\
1023void ff_ ## OPNAME ## qpel8_mc31_old_c(UINT8 *dst, UINT8 *src, int stride){\
b3184779 1024 UINT8 full[16*9];\
44eb4951 1025 UINT8 halfH[72];\
7ff037e9 1026 UINT8 halfV[64];\
44eb4951 1027 UINT8 halfHV[64];\
b3184779
MN
1028 copy_block9(full, src, 16, stride, 9);\
1029 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1030 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1031 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1032 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1033}\
db794953
MN
1034static void OPNAME ## qpel8_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
1035 UINT8 full[16*9];\
1036 UINT8 halfH[72];\
1037 UINT8 halfHV[64];\
1038 copy_block9(full, src, 16, stride, 9);\
1039 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1040 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1041 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1042 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1043}\
1044void ff_ ## OPNAME ## qpel8_mc13_old_c(UINT8 *dst, UINT8 *src, int stride){\
b3184779 1045 UINT8 full[16*9];\
44eb4951 1046 UINT8 halfH[72];\
7ff037e9 1047 UINT8 halfV[64];\
44eb4951 1048 UINT8 halfHV[64];\
b3184779
MN
1049 copy_block9(full, src, 16, stride, 9);\
1050 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1051 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1052 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1053 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1054}\
db794953
MN
1055static void OPNAME ## qpel8_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
1056 UINT8 full[16*9];\
1057 UINT8 halfH[72];\
1058 UINT8 halfHV[64];\
1059 copy_block9(full, src, 16, stride, 9);\
1060 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1061 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1062 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1063 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1064}\
1065void ff_ ## OPNAME ## qpel8_mc33_old_c(UINT8 *dst, UINT8 *src, int stride){\
b3184779 1066 UINT8 full[16*9];\
44eb4951 1067 UINT8 halfH[72];\
7ff037e9 1068 UINT8 halfV[64];\
44eb4951 1069 UINT8 halfHV[64];\
b3184779
MN
1070 copy_block9(full, src, 16, stride, 9);\
1071 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
db794953
MN
1072 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1073 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1074 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1075}\
db794953
MN
1076static void OPNAME ## qpel8_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
1077 UINT8 full[16*9];\
1078 UINT8 halfH[72];\
1079 UINT8 halfHV[64];\
1080 copy_block9(full, src, 16, stride, 9);\
1081 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1082 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1083 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1084 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1085}\
b3184779 1086static void OPNAME ## qpel8_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951
MN
1087 UINT8 halfH[72];\
1088 UINT8 halfHV[64];\
b3184779 1089 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1090 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1091 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
44eb4951 1092}\
b3184779 1093static void OPNAME ## qpel8_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951
MN
1094 UINT8 halfH[72];\
1095 UINT8 halfHV[64];\
b3184779 1096 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1097 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1098 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
44eb4951 1099}\
db794953 1100void ff_ ## OPNAME ## qpel8_mc12_old_c(UINT8 *dst, UINT8 *src, int stride){\
b3184779 1101 UINT8 full[16*9];\
44eb4951 1102 UINT8 halfH[72];\
7ff037e9 1103 UINT8 halfV[64];\
44eb4951 1104 UINT8 halfHV[64];\
b3184779
MN
1105 copy_block9(full, src, 16, stride, 9);\
1106 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1107 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1108 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1109 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1110}\
db794953
MN
1111static void OPNAME ## qpel8_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
1112 UINT8 full[16*9];\
1113 UINT8 halfH[72];\
1114 copy_block9(full, src, 16, stride, 9);\
1115 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1116 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1117 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1118}\
1119void ff_ ## OPNAME ## qpel8_mc32_old_c(UINT8 *dst, UINT8 *src, int stride){\
b3184779 1120 UINT8 full[16*9];\
44eb4951 1121 UINT8 halfH[72];\
7ff037e9 1122 UINT8 halfV[64];\
44eb4951 1123 UINT8 halfHV[64];\
b3184779
MN
1124 copy_block9(full, src, 16, stride, 9);\
1125 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1126 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1127 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1128 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1129}\
db794953
MN
1130static void OPNAME ## qpel8_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
1131 UINT8 full[16*9];\
1132 UINT8 halfH[72];\
1133 copy_block9(full, src, 16, stride, 9);\
1134 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1135 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1136 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1137}\
b3184779 1138static void OPNAME ## qpel8_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
44eb4951 1139 UINT8 halfH[72];\
b3184779 1140 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1141 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
b3184779
MN
1142}\
1143static void OPNAME ## qpel16_mc00_c (UINT8 *dst, UINT8 *src, int stride){\
45553457 1144 OPNAME ## pixels16_c(dst, src, stride, 16);\
b3184779
MN
1145}\
1146\
1147static void OPNAME ## qpel16_mc10_c(UINT8 *dst, UINT8 *src, int stride){\
1148 UINT8 half[256];\
1149 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1150 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1151}\
1152\
1153static void OPNAME ## qpel16_mc20_c(UINT8 *dst, UINT8 *src, int stride){\
1154 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
44eb4951 1155}\
b3184779
MN
1156\
1157static void OPNAME ## qpel16_mc30_c(UINT8 *dst, UINT8 *src, int stride){\
1158 UINT8 half[256];\
1159 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1160 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1161}\
1162\
1163static void OPNAME ## qpel16_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
1164 UINT8 full[24*17];\
1165 UINT8 half[256];\
1166 copy_block17(full, src, 24, stride, 17);\
826f429a 1167 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1168 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1169}\
1170\
1171static void OPNAME ## qpel16_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
1172 UINT8 full[24*17];\
1173 copy_block17(full, src, 24, stride, 17);\
826f429a 1174 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
b3184779
MN
1175}\
1176\
1177static void OPNAME ## qpel16_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
1178 UINT8 full[24*17];\
1179 UINT8 half[256];\
1180 copy_block17(full, src, 24, stride, 17);\
826f429a 1181 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1182 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1183}\
db794953 1184void ff_ ## OPNAME ## qpel16_mc11_old_c(UINT8 *dst, UINT8 *src, int stride){\
b3184779
MN
1185 UINT8 full[24*17];\
1186 UINT8 halfH[272];\
1187 UINT8 halfV[256];\
1188 UINT8 halfHV[256];\
1189 copy_block17(full, src, 24, stride, 17);\
1190 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1191 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1192 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1193 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1194}\
db794953
MN
1195static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
1196 UINT8 full[24*17];\
1197 UINT8 halfH[272];\
1198 UINT8 halfHV[256];\
1199 copy_block17(full, src, 24, stride, 17);\
1200 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1201 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1202 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1203 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1204}\
1205void ff_ ## OPNAME ## qpel16_mc31_old_c(UINT8 *dst, UINT8 *src, int stride){\
b3184779
MN
1206 UINT8 full[24*17];\
1207 UINT8 halfH[272];\
1208 UINT8 halfV[256];\
1209 UINT8 halfHV[256];\
1210 copy_block17(full, src, 24, stride, 17);\
1211 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1212 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1213 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1214 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1215}\
db794953
MN
1216static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
1217 UINT8 full[24*17];\
1218 UINT8 halfH[272];\
1219 UINT8 halfHV[256];\
1220 copy_block17(full, src, 24, stride, 17);\
1221 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1222 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1223 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1224 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1225}\
1226void ff_ ## OPNAME ## qpel16_mc13_old_c(UINT8 *dst, UINT8 *src, int stride){\
b3184779
MN
1227 UINT8 full[24*17];\
1228 UINT8 halfH[272];\
1229 UINT8 halfV[256];\
1230 UINT8 halfHV[256];\
1231 copy_block17(full, src, 24, stride, 17);\
1232 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1233 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1234 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1235 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1236}\
db794953
MN
1237static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
1238 UINT8 full[24*17];\
1239 UINT8 halfH[272];\
1240 UINT8 halfHV[256];\
1241 copy_block17(full, src, 24, stride, 17);\
1242 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1243 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1244 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1245 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1246}\
1247void ff_ ## OPNAME ## qpel16_mc33_old_c(UINT8 *dst, UINT8 *src, int stride){\
b3184779
MN
1248 UINT8 full[24*17];\
1249 UINT8 halfH[272];\
1250 UINT8 halfV[256];\
1251 UINT8 halfHV[256];\
1252 copy_block17(full, src, 24, stride, 17);\
1253 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
826f429a
MN
1254 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1255 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1256 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1257}\
db794953
MN
1258static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
1259 UINT8 full[24*17];\
1260 UINT8 halfH[272];\
1261 UINT8 halfHV[256];\
1262 copy_block17(full, src, 24, stride, 17);\
1263 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1264 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1265 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1266 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1267}\
b3184779
MN
1268static void OPNAME ## qpel16_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
1269 UINT8 halfH[272];\
1270 UINT8 halfHV[256];\
1271 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1272 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1273 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1274}\
1275static void OPNAME ## qpel16_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
1276 UINT8 halfH[272];\
1277 UINT8 halfHV[256];\
1278 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1279 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1280 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1281}\
db794953 1282void ff_ ## OPNAME ## qpel16_mc12_old_c(UINT8 *dst, UINT8 *src, int stride){\
b3184779
MN
1283 UINT8 full[24*17];\
1284 UINT8 halfH[272];\
1285 UINT8 halfV[256];\
1286 UINT8 halfHV[256];\
1287 copy_block17(full, src, 24, stride, 17);\
1288 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1289 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1290 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1291 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1292}\
db794953
MN
1293static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
1294 UINT8 full[24*17];\
1295 UINT8 halfH[272];\
1296 copy_block17(full, src, 24, stride, 17);\
1297 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1298 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1299 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1300}\
1301void ff_ ## OPNAME ## qpel16_mc32_old_c(UINT8 *dst, UINT8 *src, int stride){\
b3184779
MN
1302 UINT8 full[24*17];\
1303 UINT8 halfH[272];\
1304 UINT8 halfV[256];\
1305 UINT8 halfHV[256];\
1306 copy_block17(full, src, 24, stride, 17);\
1307 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1308 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1309 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1310 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1311}\
db794953
MN
1312static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
1313 UINT8 full[24*17];\
1314 UINT8 halfH[272];\
1315 copy_block17(full, src, 24, stride, 17);\
1316 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1317 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1318 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1319}\
b3184779
MN
1320static void OPNAME ## qpel16_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
1321 UINT8 halfH[272];\
1322 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1323 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
45553457 1324}
44eb4951 1325
b3184779
MN
1326#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1327#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1328#define op_put(a, b) a = cm[((b) + 16)>>5]
1329#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1330
1331QPEL_MC(0, put_ , _ , op_put)
1332QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1333QPEL_MC(0, avg_ , _ , op_avg)
1334//QPEL_MC(1, avg_no_rnd , _ , op_avg)
1335#undef op_avg
1336#undef op_avg_no_rnd
1337#undef op_put
1338#undef op_put_no_rnd
44eb4951 1339
1457ab52
MN
1340static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1341 uint8_t *cm = cropTbl + MAX_NEG_CROP;
1342 int i;
1343
1344 for(i=0; i<h; i++){
1345 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1346 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1347 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1348 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1349 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1350 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1351 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1352 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1353 dst+=dstStride;
1354 src+=srcStride;
1355 }
1356}
1357
1358static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1359 uint8_t *cm = cropTbl + MAX_NEG_CROP;
1360 int i;
1361
1362 for(i=0; i<w; i++){
1363 const int src_1= src[ -srcStride];
1364 const int src0 = src[0 ];
1365 const int src1 = src[ srcStride];
1366 const int src2 = src[2*srcStride];
1367 const int src3 = src[3*srcStride];
1368 const int src4 = src[4*srcStride];
1369 const int src5 = src[5*srcStride];
1370 const int src6 = src[6*srcStride];
1371 const int src7 = src[7*srcStride];
1372 const int src8 = src[8*srcStride];
1373 const int src9 = src[9*srcStride];
1374 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1375 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1376 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1377 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1378 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1379 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1380 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1381 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
1382 src++;
1383 dst++;
1384 }
1385}
1386
/* No filtering: forwards an 8-row block to put_pixels8_c. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
1390
/* Horizontal low-pass of src into half[] (8x8, stride 8), then src and half[]
 * are combined into dst by put_pixels8_l2. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
1396
/* Horizontal low-pass of 8 rows of src written directly into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1400
/* Horizontal low-pass of src into half[] (8x8, stride 8), then src+1 and
 * half[] are combined into dst by put_pixels8_l2. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
1406
/* Vertical low-pass of 8 columns of src written directly into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1410
/* Builds three planes and combines two of them:
 *  - halfH:  horizontal low-pass of 11 rows starting one row above src
 *            (11 rows x 8 bytes = 88), so the vertical pass has its extra
 *            row above and two below;
 *  - halfV:  vertical low-pass of src;
 *  - halfHV: vertical low-pass of halfH, starting at its second row.
 * halfV and halfHV are then combined into dst by put_pixels8_l2. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* Builds three planes and combines two of them:
 *  - halfH:  horizontal low-pass of 11 rows starting one row above src;
 *  - halfV:  vertical low-pass of src+1 (one column to the right);
 *  - halfHV: vertical low-pass of halfH, starting at its second row.
 * halfV and halfHV are then combined into dst by put_pixels8_l2. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* Horizontal low-pass of 11 rows (starting one row above src) into halfH,
 * then vertical low-pass of halfH from its second row straight into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
1434
1435
1436static inline int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1437{
1438 int s, i;
1439
1440 s = 0;
ba6802de 1441 for(i=0;i<16;i++) {
de6d9b64
FB
1442 s += abs(pix1[0] - pix2[0]);
1443 s += abs(pix1[1] - pix2[1]);
1444 s += abs(pix1[2] - pix2[2]);
1445 s += abs(pix1[3] - pix2[3]);
1446 s += abs(pix1[4] - pix2[4]);
1447 s += abs(pix1[5] - pix2[5]);
1448 s += abs(pix1[6] - pix2[6]);
1449 s += abs(pix1[7] - pix2[7]);
1450 s += abs(pix1[8] - pix2[8]);
1451 s += abs(pix1[9] - pix2[9]);
1452 s += abs(pix1[10] - pix2[10]);
1453 s += abs(pix1[11] - pix2[11]);
1454 s += abs(pix1[12] - pix2[12]);
1455 s += abs(pix1[13] - pix2[13]);
1456 s += abs(pix1[14] - pix2[14]);
1457 s += abs(pix1[15] - pix2[15]);
1458 pix1 += line_size;
1459 pix2 += line_size;
1460 }
1461 return s;
1462}
1463
eb4b3dd3 1464static int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1465{
1466 int s, i;
1467
1468 s = 0;
ba6802de 1469 for(i=0;i<16;i++) {
de6d9b64
FB
1470 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1471 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1472 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1473 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1474 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1475 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1476 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1477 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1478 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1479 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1480 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1481 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1482 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1483 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1484 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1485 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1486 pix1 += line_size;
1487 pix2 += line_size;
1488 }
1489 return s;
1490}
1491
eb4b3dd3 1492static int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1493{
1494 int s, i;
1495 UINT8 *pix3 = pix2 + line_size;
1496
1497 s = 0;
ba6802de 1498 for(i=0;i<16;i++) {
de6d9b64
FB
1499 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1500 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1501 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1502 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1503 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1504 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1505 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1506 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1507 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1508 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1509 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1510 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1511 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1512 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1513 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1514 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1515 pix1 += line_size;
1516 pix2 += line_size;
1517 pix3 += line_size;
1518 }
1519 return s;
1520}
1521
eb4b3dd3 1522static int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1523{
1524 int s, i;
1525 UINT8 *pix3 = pix2 + line_size;
1526
1527 s = 0;
ba6802de 1528 for(i=0;i<16;i++) {
de6d9b64
FB
1529 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1530 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1531 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1532 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1533 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1534 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1535 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1536 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1537 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1538 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1539 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1540 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1541 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1542 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1543 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1544 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1545 pix1 += line_size;
1546 pix2 += line_size;
1547 pix3 += line_size;
1548 }
1549 return s;
1550}
1551
1457ab52 1552static inline int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
ba6802de
MN
1553{
1554 int s, i;
1555
1556 s = 0;
1557 for(i=0;i<8;i++) {
1558 s += abs(pix1[0] - pix2[0]);
1559 s += abs(pix1[1] - pix2[1]);
1560 s += abs(pix1[2] - pix2[2]);
1561 s += abs(pix1[3] - pix2[3]);
1562 s += abs(pix1[4] - pix2[4]);
1563 s += abs(pix1[5] - pix2[5]);
1564 s += abs(pix1[6] - pix2[6]);
1565 s += abs(pix1[7] - pix2[7]);
1566 pix1 += line_size;
1567 pix2 += line_size;
1568 }
1569 return s;
1570}
1571
eb4b3dd3 1572static int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
ba6802de
MN
1573{
1574 int s, i;
1575
1576 s = 0;
1577 for(i=0;i<8;i++) {
1578 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1579 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1580 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1581 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1582 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1583 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1584 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1585 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1586 pix1 += line_size;
1587 pix2 += line_size;
1588 }
1589 return s;
1590}
1591
eb4b3dd3 1592static int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
ba6802de
MN
1593{
1594 int s, i;
1595 UINT8 *pix3 = pix2 + line_size;
1596
1597 s = 0;
1598 for(i=0;i<8;i++) {
1599 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1600 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1601 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1602 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1603 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1604 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1605 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1606 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1607 pix1 += line_size;
1608 pix2 += line_size;
1609 pix3 += line_size;
1610 }
1611 return s;
1612}
1613
eb4b3dd3 1614static int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
ba6802de
MN
1615{
1616 int s, i;
1617 UINT8 *pix3 = pix2 + line_size;
1618
1619 s = 0;
1620 for(i=0;i<8;i++) {
1621 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1622 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1623 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1624 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1625 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1626 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1627 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1628 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1629 pix1 += line_size;
1630 pix2 += line_size;
1631 pix3 += line_size;
1632 }
1633 return s;
1634}
1635
1457ab52
MN
/**
 * 16x16 SAD with the comparison-function signature used by DSPContext.sad[].
 * The context pointer s is not used; the work is done by pix_abs16x16_c().
 */
static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride)
{
    return pix_abs16x16_c(a, b, stride);
}
1639
/**
 * 8x8 SAD with the comparison-function signature used by DSPContext.sad[].
 * The context pointer s is not used; the work is done by pix_abs8x8_c().
 */
static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride)
{
    return pix_abs8x8_c(a, b, stride);
}
1643
477ab036 1644void ff_block_permute(DCTELEM *block, UINT8 *permutation, const UINT8 *scantable, int last)
d962f6fd 1645{
7801d21d 1646 int i;
477ab036 1647 DCTELEM temp[64];
7801d21d
MN
1648
1649 if(last<=0) return;
9a7b310d 1650 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
d962f6fd 1651
7801d21d
MN
1652 for(i=0; i<=last; i++){
1653 const int j= scantable[i];
1654 temp[j]= block[j];
1655 block[j]=0;
1656 }
1657
1658 for(i=0; i<=last; i++){
1659 const int j= scantable[i];
1660 const int perm_j= permutation[j];
1661 block[perm_j]= temp[j];
1662 }
d962f6fd 1663}
e0eac44e 1664
eb4b3dd3 1665static void clear_blocks_c(DCTELEM *blocks)
649c00c9
MN
1666{
1667 memset(blocks, 0, sizeof(DCTELEM)*6*64);
1668}
1669
11f18faf
MN
/**
 * Add src to dst byte by byte: dst[i] += src[i] for 0 <= i < w.
 * Each sum is stored back into a uint8_t, i.e. it wraps modulo 256.
 * Nothing is written when w <= 0.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
{
    for (; w > 0; w--)
        *dst++ += *src++;
}
1685
/**
 * Store the byte-wise difference of two buffers:
 * dst[i] = src1[i] - src2[i] for 0 <= i < w.
 * Each difference is stored into a uint8_t, i.e. it wraps modulo 256.
 * Nothing is written when w <= 0.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
{
    for (; w > 0; w--)
        *dst++ = *src1++ - *src2++;
}
1701
1457ab52
MN
/*
 * Butterfly helpers for the transforms below.
 *
 * BUTTERFLY2 and BUTTERFLY1 expand to a single statement (do { } while(0)),
 * so they are safe inside unbraced if/else; terminate each use with ';'.
 */

/* o1 = i1 + i2, o2 = i1 - i2.
   i1 and i2 are each evaluated twice and must not refer to o1. */
#define BUTTERFLY2(o1,o2,i1,i2) \
do {\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
} while(0)

/* In-place butterfly: x = x + y, y = x - y (both computed from the old
   values). The temporaries carry a prefix so that arguments named a or b
   are not captured by the macro's own locals. */
#define BUTTERFLY1(x,y) \
do {\
    const int bfly1_a_= (x);\
    const int bfly1_b_= (y);\
    (x)= bfly1_a_+bfly1_b_;\
    (y)= bfly1_a_-bfly1_b_;\
} while(0)

/* |x + y| + |x - y|: a final butterfly stage folded into the absolute sum.
   x and y are evaluated more than once. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
1716
/**
 * Transform-domain distance between two 8x8 blocks.
 *
 * The difference src - dst is run through three butterfly stages along each
 * row and then along each column (the 8x8 transform the function is named
 * after); the absolute values of all 64 results are summed.
 *
 * @param s      context pointer, not used here
 * @param dst    first 8x8 block
 * @param src    second 8x8 block
 * @param stride distance between vertically adjacent samples in both blocks
 * @return sum of the absolute transformed differences
 */
static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
    int temp[64];
    int sum = 0;
    int i;

    /* horizontal pass: one row of the difference block per iteration */
    for (i = 0; i < 8; i++) {
        int *row = temp + 8*i;
        const uint8_t *sp = src + stride*i;
        const uint8_t *dp = dst + stride*i;

        BUTTERFLY2(row[0], row[1], sp[0]-dp[0], sp[1]-dp[1]);
        BUTTERFLY2(row[2], row[3], sp[2]-dp[2], sp[3]-dp[3]);
        BUTTERFLY2(row[4], row[5], sp[4]-dp[4], sp[5]-dp[5]);
        BUTTERFLY2(row[6], row[7], sp[6]-dp[6], sp[7]-dp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass: one column per iteration; the last stage is folded
       into the absolute sum via BUTTERFLYA */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
             + BUTTERFLYA(col[8*1], col[8*5])
             + BUTTERFLYA(col[8*2], col[8*6])
             + BUTTERFLYA(col[8*3], col[8*7]);
    }

    return sum;
}
1766
/**
 * Transform-domain activity of a single 8x8 block.
 *
 * mean is subtracted from every sample, the result is run through three
 * butterfly stages along each row and then along each column, and the
 * absolute values of all 64 results are summed.
 *
 * @param src    8x8 block
 * @param stride distance between vertically adjacent samples
 * @param mean   value subtracted from every sample before the transform
 * @return sum of the absolute transformed values
 */
static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
    int temp[64];
    int sum = 0;
    int i;

    //FIXME OOOPS ignore 0 term instead of mean mess
    /* horizontal pass: one row of (src - mean) per iteration */
    for (i = 0; i < 8; i++) {
        int *row = temp + 8*i;
        const uint8_t *sp = src + stride*i;

        BUTTERFLY2(row[0], row[1], sp[0]-mean, sp[1]-mean);
        BUTTERFLY2(row[2], row[3], sp[2]-mean, sp[3]-mean);
        BUTTERFLY2(row[4], row[5], sp[4]-mean, sp[5]-mean);
        BUTTERFLY2(row[6], row[7], sp[6]-mean, sp[7]-mean);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass: one column per iteration; the last stage is folded
       into the absolute sum via BUTTERFLYA */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
             + BUTTERFLYA(col[8*1], col[8*5])
             + BUTTERFLYA(col[8*2], col[8*6])
             + BUTTERFLYA(col[8*3], col[8*7]);
    }

    return sum;
}
1810
1811static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
1812 MpegEncContext * const s= (MpegEncContext *)c;
1813 DCTELEM temp[64];
1814 int sum=0, i;
1815
1816 s->dsp.diff_pixels(temp, src1, src2, stride);
1817 s->fdct(temp);
1818
1819 for(i=0; i<64; i++)
1820 sum+= ABS(temp[i]);
1821
1822 return sum;
1823}
1824
1825void simple_idct(INT16 *block); //FIXME
1826
1827static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
1828 MpegEncContext * const s= (MpegEncContext *)c;
1829 DCTELEM temp[64], bak[64];
1830 int sum=0, i;
1831
1832 s->mb_intra=0;
1833
1834 s->dsp.diff_pixels(temp, src1, src2, stride);
1835
1836 memcpy(bak, temp, 64*sizeof(DCTELEM));
1837
1838 s->dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
1839 s->dct_unquantize(s, temp, 0, s->qscale);
1840 simple_idct(temp); //FIXME
1841
1842 for(i=0; i<64; i++)
1843 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
1844
1845 return sum;
1846}
1847
1848WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
1849WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
1850WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
1851
eb4b3dd3 1852void dsputil_init(DSPContext* c, unsigned mask)
e0eac44e 1853{
5abd509a 1854 static int init_done = 0;
d2975f8d 1855 int i;
e0eac44e 1856
5abd509a
ZK
1857 if (!init_done) {
1858 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1859 for(i=0;i<MAX_NEG_CROP;i++) {
1860 cropTbl[i] = 0;
1861 cropTbl[i + MAX_NEG_CROP + 256] = 255;
1862 }
de6d9b64 1863
5abd509a
ZK
1864 for(i=0;i<512;i++) {
1865 squareTbl[i] = (i - 256) * (i - 256);
1866 }
92ddb692
ZK
1867
1868 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
1869
1870 init_done = 1;
de6d9b64
FB
1871 }
1872
eb4b3dd3
ZK
1873 c->get_pixels = get_pixels_c;
1874 c->diff_pixels = diff_pixels_c;
1875 c->put_pixels_clamped = put_pixels_clamped_c;
1876 c->add_pixels_clamped = add_pixels_clamped_c;
1877 c->gmc1 = gmc1_c;
1878 c->gmc = gmc_c;
1879 c->clear_blocks = clear_blocks_c;
1880 c->pix_sum = pix_sum_c;
1881 c->pix_norm1 = pix_norm1_c;
1457ab52
MN
1882 c->sse[0]= sse16_c;
1883 c->sse[1]= sse8_c;
eb4b3dd3 1884
45553457 1885 /* TODO [0] 16 [1] 8 */
eb4b3dd3
ZK
1886 c->pix_abs16x16 = pix_abs16x16_c;
1887 c->pix_abs16x16_x2 = pix_abs16x16_x2_c;
1888 c->pix_abs16x16_y2 = pix_abs16x16_y2_c;
1889 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1890 c->pix_abs8x8 = pix_abs8x8_c;
1891 c->pix_abs8x8_x2 = pix_abs8x8_x2_c;
1892 c->pix_abs8x8_y2 = pix_abs8x8_y2_c;
1893 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1894
45553457
ZK
1895#define dspfunc(PFX, IDX, NUM) \
1896 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
1897 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
1898 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
1899 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
1900
1901 dspfunc(put, 0, 16);
1902 dspfunc(put_no_rnd, 0, 16);
1903 dspfunc(put, 1, 8);
1904 dspfunc(put_no_rnd, 1, 8);
1905
1906 dspfunc(avg, 0, 16);
1907 dspfunc(avg_no_rnd, 0, 16);
1908 dspfunc(avg, 1, 8);
1909 dspfunc(avg_no_rnd, 1, 8);
1910#undef dspfunc
1911
1912#define dspfunc(PFX, IDX, NUM) \
1913 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
1914 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
1915 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
1916 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
1917 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
1918 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
1919 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
1920 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
1921 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
1922 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
1923 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
1924 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
1925 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
1926 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
1927 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
1928 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
1929
1930 dspfunc(put_qpel, 0, 16);
1931 dspfunc(put_no_rnd_qpel, 0, 16);
1932
1933 dspfunc(avg_qpel, 0, 16);
1934 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
1935
1936 dspfunc(put_qpel, 1, 8);
1937 dspfunc(put_no_rnd_qpel, 1, 8);
1938
1939 dspfunc(avg_qpel, 1, 8);
1940 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
1941#undef dspfunc
c9a2ebc4 1942
1457ab52
MN
1943 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
1944 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
1945 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
1946 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
1947 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
1948 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
1949 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
1950 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
1951
1952 c->hadamard8_diff[0]= hadamard8_diff16_c;
1953 c->hadamard8_diff[1]= hadamard8_diff_c;
1954 c->hadamard8_abs = hadamard8_abs_c;
1955
1956 c->dct_sad[0]= dct_sad16x16_c;
1957 c->dct_sad[1]= dct_sad8x8_c;
1958
1959 c->sad[0]= sad16x16_c;
1960 c->sad[1]= sad8x8_c;
1961
1962 c->quant_psnr[0]= quant_psnr16x16_c;
1963 c->quant_psnr[1]= quant_psnr8x8_c;
1964
11f18faf
MN
1965 c->add_bytes= add_bytes_c;
1966 c->diff_bytes= diff_bytes_c;
1967
980fc7b8 1968#ifdef HAVE_MMX
eb4b3dd3 1969 dsputil_init_mmx(c, mask);
34dfe896
ZK
1970 if (ff_bit_exact)
1971 {
1972 /* FIXME - AVCodec context should have flag for bitexact match */
1973 /* fprintf(stderr, "\n\n\nff_bit_exact %d\n\n\n\n", ff_bit_exact); */
1974 dsputil_set_bit_exact_mmx(c, mask);
1975 }
de6d9b64 1976#endif
3d03c0a2 1977#ifdef ARCH_ARMV4L
eb4b3dd3 1978 dsputil_init_armv4l(c, mask);
3d03c0a2 1979#endif
c34270f5 1980#ifdef HAVE_MLIB
eb4b3dd3 1981 dsputil_init_mlib(c, mask);
c34270f5 1982#endif
1e98dffb 1983#ifdef ARCH_ALPHA
eb4b3dd3 1984 dsputil_init_alpha(c, mask);
1e98dffb 1985#endif
59925ef2 1986#ifdef ARCH_POWERPC
eb4b3dd3 1987 dsputil_init_ppc(c, mask);
a43bd1d7 1988#endif
d46aba26 1989#ifdef HAVE_MMI
eb4b3dd3 1990 dsputil_init_mmi(c, mask);
d46aba26 1991#endif
de6d9b64 1992}
43f1708f 1993
57060b1e
FB
1994/* remove any non bit exact operation (testing purpose) */
1995void avcodec_set_bit_exact(void)
1996{
5596c60c 1997 ff_bit_exact=1;
57060b1e 1998#ifdef HAVE_MMX
34dfe896 1999// FIXME - better set_bit_exact
eb4b3dd3 2000// dsputil_set_bit_exact_mmx();
57060b1e
FB
2001#endif
2002}