added inlined put/add functions
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
1/*
2 * DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
de6d9b64 4 *
ff4ec49e
FB
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
de6d9b64 9 *
ff4ec49e 10 * This library is distributed in the hope that it will be useful,
de6d9b64 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
de6d9b64 14 *
ff4ec49e
FB
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
7ff037e9 18 *
59fe111e 19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
de6d9b64 20 */
de6d9b64
FB
21#include "avcodec.h"
22#include "dsputil.h"
d962f6fd 23#include "simple_idct.h"
de6d9b64 24
4af7bcc1 25void (*ff_idct)(DCTELEM *block);
03c94ede 26void (*av_fdct)(DCTELEM *block);
de6d9b64 27void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
9dbcbd92 28void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
de6d9b64
FB
29void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
30void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
44eb4951 31void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
649c00c9 32void (*clear_blocks)(DCTELEM *blocks);
de6d9b64
FB
33
34op_pixels_abs_func pix_abs16x16;
35op_pixels_abs_func pix_abs16x16_x2;
36op_pixels_abs_func pix_abs16x16_y2;
37op_pixels_abs_func pix_abs16x16_xy2;
38
ba6802de
MN
39op_pixels_abs_func pix_abs8x8;
40op_pixels_abs_func pix_abs8x8_x2;
41op_pixels_abs_func pix_abs8x8_y2;
42op_pixels_abs_func pix_abs8x8_xy2;
43
0cfa9713 44UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
de6d9b64
FB
45UINT32 squareTbl[512];
46
f0ca2e1b
ZK
47extern INT16 default_intra_matrix[64];
48extern INT16 default_non_intra_matrix[64];
49extern INT16 ff_mpeg4_default_intra_matrix[64];
50extern INT16 ff_mpeg4_default_non_intra_matrix[64];
e0eac44e
FB
51
/*
 * 64-entry scan table; every value 0..63 occurs exactly once.
 * Presumably entry i is the row-major index (row*8 + column) of the
 * i-th coefficient of an 8x8 block in zigzag scan order -- TODO confirm
 * against the users of this table.
 * build_zigzag_end() reads this table to fill zigzag_end[].
 */
52UINT8 zigzag_direct[64] = {
53 0, 1, 8, 16, 9, 2, 3, 10,
54 17, 24, 32, 25, 18, 11, 4, 5,
55 12, 19, 26, 33, 40, 48, 41, 34,
56 27, 20, 13, 6, 7, 14, 21, 28,
57 35, 42, 49, 56, 57, 50, 43, 36,
58 29, 22, 15, 23, 30, 37, 44, 51,
59 58, 59, 52, 45, 38, 31, 39, 46,
60 53, 60, 61, 54, 47, 55, 62, 63
61};
62
2f349de2
MN
63/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
64UINT16 __align8 inv_zigzag_direct16[64];
65
66/* not permutated zigzag_direct for MMX quantizer */
67UINT8 zigzag_direct_noperm[64];
68
e0eac44e
FB
/*
 * 64-entry scan table; every value 0..63 occurs exactly once.
 * Presumably an alternate coefficient scan order for 8x8 blocks that
 * favours horizontal runs (entries are row-major indices) -- TODO confirm
 * with the code that selects this table.
 */
69UINT8 ff_alternate_horizontal_scan[64] = {
70 0, 1, 2, 3, 8, 9, 16, 17,
71 10, 11, 4, 5, 6, 7, 15, 14,
72 13, 12, 19, 18, 24, 25, 32, 33,
73 26, 27, 20, 21, 22, 23, 28, 29,
74 30, 31, 34, 35, 40, 41, 48, 49,
75 42, 43, 36, 37, 38, 39, 44, 45,
76 46, 47, 50, 51, 56, 57, 58, 59,
77 52, 53, 54, 55, 60, 61, 62, 63,
78};
79
/*
 * 64-entry scan table; every value 0..63 occurs exactly once.
 * Presumably an alternate coefficient scan order for 8x8 blocks that
 * favours vertical runs (entries are row-major indices) -- TODO confirm
 * with the code that selects this table.
 */
80UINT8 ff_alternate_vertical_scan[64] = {
81 0, 8, 16, 24, 1, 9, 2, 10,
82 17, 25, 32, 40, 48, 56, 57, 49,
83 41, 33, 26, 18, 3, 11, 4, 12,
84 19, 27, 34, 42, 50, 58, 35, 43,
85 51, 59, 20, 28, 5, 13, 6, 14,
86 21, 29, 36, 44, 52, 60, 37, 45,
87 53, 61, 22, 30, 7, 15, 23, 31,
88 38, 46, 54, 62, 39, 47, 55, 63,
89};
90
e4986da9
J
91#ifdef SIMPLE_IDCT
92
0a8d8945 93/* Input permutation for the simple_idct_mmx */
5a240838 94static UINT8 simple_mmx_permutation[64]={
0a8d8945
MN
95 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
96 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
97 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
98 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
99 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
100 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
101 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
102 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
5a240838 103};
e4986da9 104#endif
5a240838 105
2f349de2
MN
106/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
107UINT32 inverse[256]={
108 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
109 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
110 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
111 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
112 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
113 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
114 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
115 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
116 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
117 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
118 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
119 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
120 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
121 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
122 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
123 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
124 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
125 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
126 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
127 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
128 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
129 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
130 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
131 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
132 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
133 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
134 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
135 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
136 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
137 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
138 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
139 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
140};
141
badaf88e
MN
142/* used to skip zeros at the end */
143UINT8 zigzag_end[64];
144
5a240838
MN
145UINT8 permutation[64];
146//UINT8 invPermutation[64];
147
20695ec9 148static void build_zigzag_end(void)
badaf88e
MN
149{
150 int lastIndex;
151 int lastIndexAfterPerm=0;
152 for(lastIndex=0; lastIndex<64; lastIndex++)
153 {
154 if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
155 lastIndexAfterPerm= zigzag_direct[lastIndex];
156 zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
157 }
158}
159
de6d9b64
FB
160void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size)
161{
162 DCTELEM *p;
163 const UINT8 *pix;
164 int i;
165
166 /* read the pixels */
167 p = block;
168 pix = pixels;
169 for(i=0;i<8;i++) {
170 p[0] = pix[0];
171 p[1] = pix[1];
172 p[2] = pix[2];
173 p[3] = pix[3];
174 p[4] = pix[4];
175 p[5] = pix[5];
176 p[6] = pix[6];
177 p[7] = pix[7];
178 pix += line_size;
179 p += 8;
180 }
181}
182
9dbcbd92
MN
183void diff_pixels_c(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride){
184 DCTELEM *p;
185 int i;
186
187 /* read the pixels */
188 p = block;
189 for(i=0;i<8;i++) {
190 p[0] = s1[0] - s2[0];
191 p[1] = s1[1] - s2[1];
192 p[2] = s1[2] - s2[2];
193 p[3] = s1[3] - s2[3];
194 p[4] = s1[4] - s2[4];
195 p[5] = s1[5] - s2[5];
196 p[6] = s1[6] - s2[6];
197 p[7] = s1[7] - s2[7];
198 s1 += stride;
199 s2 += stride;
200 p += 8;
201 }
202}
203
204
de6d9b64
FB
205void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
206{
207 const DCTELEM *p;
208 UINT8 *pix;
209 int i;
210 UINT8 *cm = cropTbl + MAX_NEG_CROP;
211
212 /* read the pixels */
213 p = block;
214 pix = pixels;
215 for(i=0;i<8;i++) {
216 pix[0] = cm[p[0]];
217 pix[1] = cm[p[1]];
218 pix[2] = cm[p[2]];
219 pix[3] = cm[p[3]];
220 pix[4] = cm[p[4]];
221 pix[5] = cm[p[5]];
222 pix[6] = cm[p[6]];
223 pix[7] = cm[p[7]];
224 pix += line_size;
225 p += 8;
226 }
227}
228
229void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
230{
231 const DCTELEM *p;
232 UINT8 *pix;
233 int i;
234 UINT8 *cm = cropTbl + MAX_NEG_CROP;
235
236 /* read the pixels */
237 p = block;
238 pix = pixels;
239 for(i=0;i<8;i++) {
240 pix[0] = cm[pix[0] + p[0]];
241 pix[1] = cm[pix[1] + p[1]];
242 pix[2] = cm[pix[2] + p[2]];
243 pix[3] = cm[pix[3] + p[3]];
244 pix[4] = cm[pix[4] + p[4]];
245 pix[5] = cm[pix[5] + p[5]];
246 pix[6] = cm[pix[6] + p[6]];
247 pix[7] = cm[pix[7] + p[7]];
248 pix += line_size;
249 p += 8;
250 }
251}
252
10fc8424
MN
/*
 * LD32(a) / LD64(a): load a 32- or 64-bit word from address a.
 *
 * With GCC-compatible compilers the load goes through a packed struct, so
 * the compiler does not assume natural alignment of the address.
 * The fallback is a plain pointer dereference, which requires the target to
 * tolerate unaligned loads. Both variants read through a const-qualified
 * pointer so they can be applied to const source buffers without discarding
 * the qualifier.
 */
#ifdef __GNUC__

struct unaligned_64 { uint64_t l; } __attribute__((packed));
struct unaligned_32 { uint32_t l; } __attribute__((packed));

#define LD32(a) (((const struct unaligned_32 *) (a))->l)
#define LD64(a) (((const struct unaligned_64 *) (a))->l)

#else /* !__GNUC__ */

#define LD32(a) (*((const uint32_t*)(a)))
#define LD64(a) (*((const uint64_t*)(a)))

#endif /* __GNUC__ */
267
59fe111e
MN
268#if 0
269
/*
 * PIXOP2(OPNAME, OP) -- 64-bit variant (sits in the branch disabled by the
 * enclosing "#if 0").
 *
 * Expands to the functions OPNAME_pixels, OPNAME_pixels_x2/_y2/_xy2 and
 * OPNAME_no_rnd_pixels_x2/_y2/_xy2, plus the dispatch tables
 * OPNAME_pixels_tab[4] and OPNAME_no_rnd_pixels_tab[4] (order: plain, x2,
 * y2, xy2). Each function handles h rows of 8 bytes, one 64-bit word per
 * row; OP(dst, value) is the store operation supplied by the caller.
 *
 *  - x2 combines each byte with its right neighbour (pixels+1), y2 with the
 *    byte one row below (pixels+line_size), xy2 with all three neighbours.
 *  - (a&b) + (((a^b)&0xFE..FE)>>1) is the per-byte average rounded down;
 *    (a|b) - (((a^b)&0xFE..FE)>>1) is the per-byte average rounded up.
 *    The 0xFE mask stops bits from leaking between neighbouring bytes.
 *  - xy2 keeps the low 2 bits (l0/l1) and the pre-shifted high 6 bits
 *    (h0/h1) of every byte in separate sums; the rounding constant per byte
 *    is 2 in the rounding version and 1 in the no_rnd version.
 *  - The xy2 loops emit two rows per iteration, so an odd h writes h+1 rows.
 *  - Destination rows are written through a uint64_t pointer; presumably
 *    block must be suitably aligned -- confirm with the callers.
 */
270#define PIXOP2(OPNAME, OP) \
271void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
272{\
273 int i;\
274 for(i=0; i<h; i++){\
275 OP(*((uint64_t*)block), LD64(pixels));\
276 pixels+=line_size;\
277 block +=line_size;\
278 }\
279}\
280\
281void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
282{\
283 int i;\
284 for(i=0; i<h; i++){\
285 const uint64_t a= LD64(pixels );\
286 const uint64_t b= LD64(pixels+1);\
287 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
288 pixels+=line_size;\
289 block +=line_size;\
290 }\
291}\
292\
293void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
294{\
295 int i;\
296 for(i=0; i<h; i++){\
297 const uint64_t a= LD64(pixels );\
298 const uint64_t b= LD64(pixels+1);\
299 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
300 pixels+=line_size;\
301 block +=line_size;\
302 }\
303}\
304\
305void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
306{\
307 int i;\
308 for(i=0; i<h; i++){\
309 const uint64_t a= LD64(pixels );\
310 const uint64_t b= LD64(pixels+line_size);\
311 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
312 pixels+=line_size;\
313 block +=line_size;\
314 }\
315}\
316\
317void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
318{\
319 int i;\
320 for(i=0; i<h; i++){\
321 const uint64_t a= LD64(pixels );\
322 const uint64_t b= LD64(pixels+line_size);\
323 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
324 pixels+=line_size;\
325 block +=line_size;\
326 }\
327}\
328\
329void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
330{\
331 int i;\
332 const uint64_t a= LD64(pixels );\
333 const uint64_t b= LD64(pixels+1);\
334 uint64_t l0= (a&0x0303030303030303ULL)\
335 + (b&0x0303030303030303ULL)\
336 + 0x0202020202020202ULL;\
337 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
338 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
339 uint64_t l1,h1;\
340\
341 pixels+=line_size;\
342 for(i=0; i<h; i+=2){\
343 uint64_t a= LD64(pixels );\
344 uint64_t b= LD64(pixels+1);\
345 l1= (a&0x0303030303030303ULL)\
346 + (b&0x0303030303030303ULL);\
347 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
348 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
349 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
350 pixels+=line_size;\
351 block +=line_size;\
352 a= LD64(pixels );\
353 b= LD64(pixels+1);\
354 l0= (a&0x0303030303030303ULL)\
355 + (b&0x0303030303030303ULL)\
356 + 0x0202020202020202ULL;\
357 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
358 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
359 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
360 pixels+=line_size;\
361 block +=line_size;\
362 }\
363}\
364\
365void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
366{\
367 int i;\
368 const uint64_t a= LD64(pixels );\
369 const uint64_t b= LD64(pixels+1);\
370 uint64_t l0= (a&0x0303030303030303ULL)\
371 + (b&0x0303030303030303ULL)\
372 + 0x0101010101010101ULL;\
373 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
374 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
375 uint64_t l1,h1;\
376\
377 pixels+=line_size;\
378 for(i=0; i<h; i+=2){\
379 uint64_t a= LD64(pixels );\
380 uint64_t b= LD64(pixels+1);\
381 l1= (a&0x0303030303030303ULL)\
382 + (b&0x0303030303030303ULL);\
383 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
384 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
385 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
386 pixels+=line_size;\
387 block +=line_size;\
388 a= LD64(pixels );\
389 b= LD64(pixels+1);\
390 l0= (a&0x0303030303030303ULL)\
391 + (b&0x0303030303030303ULL)\
392 + 0x0101010101010101ULL;\
393 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
394 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
395 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
396 pixels+=line_size;\
397 block +=line_size;\
398 }\
399}\
400\
401void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
402 OPNAME ## _pixels,\
403 OPNAME ## _pixels_x2,\
404 OPNAME ## _pixels_y2,\
405 OPNAME ## _pixels_xy2,\
406};\
407\
408void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
409 OPNAME ## _pixels,\
410 OPNAME ## _no_rnd_pixels_x2,\
411 OPNAME ## _no_rnd_pixels_y2,\
412 OPNAME ## _no_rnd_pixels_xy2,\
413};
414
/* Store operation for the "avg" family: replaces a with the per-byte
 * rounded-up average of a and b (eight packed bytes). */
415#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
416#else // 32 bit variant (the 64 bit variant above is disabled by the #if 0)
417
/*
 * PIXOP2(OPNAME, OP) -- 32-bit variant.
 *
 * Expands to the functions OPNAME_pixels, OPNAME_pixels_x2/_y2/_xy2 and
 * OPNAME_no_rnd_pixels_x2/_y2/_xy2, plus the dispatch tables
 * OPNAME_pixels_tab[4] and OPNAME_no_rnd_pixels_tab[4] (order: plain, x2,
 * y2, xy2). Each function handles h rows of 8 bytes as two 32-bit words
 * per row; OP(dst, value) is the store operation supplied by the caller.
 *
 *  - x2 combines each byte with its right neighbour (pixels+1), y2 with the
 *    byte one row below (pixels+line_size), xy2 with all three neighbours.
 *  - (a&b) + (((a^b)&0xFEFEFEFE)>>1) is the per-byte average rounded down;
 *    (a|b) - (((a^b)&0xFEFEFEFE)>>1) is the per-byte average rounded up.
 *    The 0xFE mask stops bits from leaking between neighbouring bytes.
 *  - The x2/y2 functions step 4 bytes twice inside a row and then add
 *    line_size-8 to reach the next row.
 *  - xy2 keeps the low 2 bits (l0/l1) and the pre-shifted high 6 bits
 *    (h0/h1) of every byte in separate sums; the rounding constant per byte
 *    is 2 in the rounding version and 1 in the no_rnd version. The outer j
 *    loop processes the left and the right 4-byte column in turn and rewinds
 *    both pointers in between.
 *  - The xy2 loops emit two rows per iteration, so an odd h writes h+1 rows
 *    and the pointer rewind (which uses h) no longer lands on the top row.
 *  - Destination rows are written through a uint32_t pointer; presumably
 *    block must be suitably aligned -- confirm with the callers.
 */
418#define PIXOP2(OPNAME, OP) \
419void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
420{\
421 int i;\
422 for(i=0; i<h; i++){\
423 OP(*((uint32_t*)(block )), LD32(pixels ));\
424 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
425 pixels+=line_size;\
426 block +=line_size;\
427 }\
428}\
429\
430void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
431{\
432 int i;\
433 for(i=0; i<h; i++){\
434 int j;\
435 for(j=0; j<2; j++){\
436 const uint32_t a= LD32(pixels );\
437 const uint32_t b= LD32(pixels+1);\
438 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
439 pixels+=4;\
440 block +=4;\
441 }\
442 pixels+=line_size-8;\
443 block +=line_size-8;\
444 }\
445}\
446\
447void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
448{\
449 int i;\
450 for(i=0; i<h; i++){\
451 int j;\
452 for(j=0; j<2; j++){\
453 const uint32_t a= LD32(pixels );\
454 const uint32_t b= LD32(pixels+1);\
455 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
456 pixels+=4;\
457 block +=4;\
458 }\
459 pixels+=line_size-8;\
460 block +=line_size-8;\
461 }\
462}\
463\
464void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
465{\
466 int i;\
467 for(i=0; i<h; i++){\
468 int j;\
469 for(j=0; j<2; j++){\
470 const uint32_t a= LD32(pixels );\
471 const uint32_t b= LD32(pixels+line_size);\
472 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
473 pixels+=4;\
474 block +=4;\
475 }\
476 pixels+=line_size-8;\
477 block +=line_size-8;\
478 }\
479}\
480\
481void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
482{\
483 int i;\
484 for(i=0; i<h; i++){\
485 int j;\
486 for(j=0; j<2; j++){\
487 const uint32_t a= LD32(pixels );\
488 const uint32_t b= LD32(pixels+line_size);\
489 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
490 pixels+=4;\
491 block +=4;\
492 }\
493 pixels+=line_size-8;\
494 block +=line_size-8;\
495 }\
496}\
497\
498void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
499{\
500 int j;\
501 for(j=0; j<2; j++){\
502 int i;\
503 const uint32_t a= LD32(pixels );\
504 const uint32_t b= LD32(pixels+1);\
505 uint32_t l0= (a&0x03030303UL)\
506 + (b&0x03030303UL)\
507 + 0x02020202UL;\
508 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
509 + ((b&0xFCFCFCFCUL)>>2);\
510 uint32_t l1,h1;\
511\
512 pixels+=line_size;\
513 for(i=0; i<h; i+=2){\
514 uint32_t a= LD32(pixels );\
515 uint32_t b= LD32(pixels+1);\
516 l1= (a&0x03030303UL)\
517 + (b&0x03030303UL);\
518 h1= ((a&0xFCFCFCFCUL)>>2)\
519 + ((b&0xFCFCFCFCUL)>>2);\
520 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
521 pixels+=line_size;\
522 block +=line_size;\
523 a= LD32(pixels );\
524 b= LD32(pixels+1);\
525 l0= (a&0x03030303UL)\
526 + (b&0x03030303UL)\
527 + 0x02020202UL;\
528 h0= ((a&0xFCFCFCFCUL)>>2)\
529 + ((b&0xFCFCFCFCUL)>>2);\
530 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
531 pixels+=line_size;\
532 block +=line_size;\
533 }\
534 pixels+=4-line_size*(h+1);\
535 block +=4-line_size*h;\
536 }\
537}\
538\
539void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
540{\
541 int j;\
542 for(j=0; j<2; j++){\
543 int i;\
544 const uint32_t a= LD32(pixels );\
545 const uint32_t b= LD32(pixels+1);\
546 uint32_t l0= (a&0x03030303UL)\
547 + (b&0x03030303UL)\
548 + 0x01010101UL;\
549 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
550 + ((b&0xFCFCFCFCUL)>>2);\
551 uint32_t l1,h1;\
552\
553 pixels+=line_size;\
554 for(i=0; i<h; i+=2){\
555 uint32_t a= LD32(pixels );\
556 uint32_t b= LD32(pixels+1);\
557 l1= (a&0x03030303UL)\
558 + (b&0x03030303UL);\
559 h1= ((a&0xFCFCFCFCUL)>>2)\
560 + ((b&0xFCFCFCFCUL)>>2);\
561 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
562 pixels+=line_size;\
563 block +=line_size;\
564 a= LD32(pixels );\
565 b= LD32(pixels+1);\
566 l0= (a&0x03030303UL)\
567 + (b&0x03030303UL)\
568 + 0x01010101UL;\
569 h0= ((a&0xFCFCFCFCUL)>>2)\
570 + ((b&0xFCFCFCFCUL)>>2);\
571 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
572 pixels+=line_size;\
573 block +=line_size;\
574 }\
575 pixels+=4-line_size*(h+1);\
576 block +=4-line_size*h;\
577 }\
578}\
579\
580void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
581 OPNAME ## _pixels,\
582 OPNAME ## _pixels_x2,\
583 OPNAME ## _pixels_y2,\
584 OPNAME ## _pixels_xy2,\
585};\
586\
587void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
588 OPNAME ## _pixels,\
589 OPNAME ## _no_rnd_pixels_x2,\
590 OPNAME ## _no_rnd_pixels_y2,\
591 OPNAME ## _no_rnd_pixels_xy2,\
592};
/* Store operation for the "avg" family: replaces a with the per-byte
 * rounded-up average of a and b (four packed bytes). */
593#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
594#endif
595
596#define op_put(a, b) a = b
597
598PIXOP2(avg, op_avg)
599PIXOP2(put, op_put)
600#undef op_avg
601#undef op_put
602
57060b1e 603#if 0
59fe111e 604/* FIXME: this stuff could be removed as it's not really used anymore */
de6d9b64
FB
/*
 * PIXOP(BTYPE, OPNAME, OP, INCR) -- legacy byte-at-a-time generator
 * (currently compiled out by the enclosing "#if 0").
 *
 * Expands to the static functions OPNAME_pixels, OPNAME_pixels_x2,
 * OPNAME_pixels_y2 and OPNAME_pixels_xy2 plus the table
 * OPNAME_pixels_tab[4] holding them in that order.
 *
 *  BTYPE - element type of the destination block
 *  OP    - store operation, invoked as OP(dst_element, value)
 *  INCR  - number of destination elements to advance per row
 *
 * Every function processes 8 pixels per row for h rows. The x2 / y2 / xy2
 * versions feed OP with avg2()/avg4() of the pixel and its right, lower, or
 * right+lower neighbours; avg2 and avg4 must therefore be defined at the
 * point where the macro is instantiated.
 * The rows are driven by do { } while (--h), so h must be at least 1.
 */
605#define PIXOP(BTYPE, OPNAME, OP, INCR) \
606 \
607static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
608{ \
609 BTYPE *p; \
610 const UINT8 *pix; \
611 \
612 p = block; \
613 pix = pixels; \
614 do { \
615 OP(p[0], pix[0]); \
616 OP(p[1], pix[1]); \
617 OP(p[2], pix[2]); \
618 OP(p[3], pix[3]); \
619 OP(p[4], pix[4]); \
620 OP(p[5], pix[5]); \
621 OP(p[6], pix[6]); \
622 OP(p[7], pix[7]); \
623 pix += line_size; \
624 p += INCR; \
625 } while (--h);; \
626} \
627 \
628static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
629{ \
630 BTYPE *p; \
631 const UINT8 *pix; \
632 \
633 p = block; \
634 pix = pixels; \
635 do { \
636 OP(p[0], avg2(pix[0], pix[1])); \
637 OP(p[1], avg2(pix[1], pix[2])); \
638 OP(p[2], avg2(pix[2], pix[3])); \
639 OP(p[3], avg2(pix[3], pix[4])); \
640 OP(p[4], avg2(pix[4], pix[5])); \
641 OP(p[5], avg2(pix[5], pix[6])); \
642 OP(p[6], avg2(pix[6], pix[7])); \
643 OP(p[7], avg2(pix[7], pix[8])); \
644 pix += line_size; \
645 p += INCR; \
646 } while (--h); \
647} \
648 \
649static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
650{ \
651 BTYPE *p; \
652 const UINT8 *pix; \
653 const UINT8 *pix1; \
654 \
655 p = block; \
656 pix = pixels; \
657 pix1 = pixels + line_size; \
658 do { \
659 OP(p[0], avg2(pix[0], pix1[0])); \
660 OP(p[1], avg2(pix[1], pix1[1])); \
661 OP(p[2], avg2(pix[2], pix1[2])); \
662 OP(p[3], avg2(pix[3], pix1[3])); \
663 OP(p[4], avg2(pix[4], pix1[4])); \
664 OP(p[5], avg2(pix[5], pix1[5])); \
665 OP(p[6], avg2(pix[6], pix1[6])); \
666 OP(p[7], avg2(pix[7], pix1[7])); \
667 pix += line_size; \
668 pix1 += line_size; \
669 p += INCR; \
670 } while(--h); \
671} \
672 \
673static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
674{ \
675 BTYPE *p; \
676 const UINT8 *pix; \
677 const UINT8 *pix1; \
678 \
679 p = block; \
680 pix = pixels; \
681 pix1 = pixels + line_size; \
682 do { \
683 OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1])); \
684 OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2])); \
685 OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3])); \
686 OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4])); \
687 OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5])); \
688 OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6])); \
689 OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7])); \
690 OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8])); \
691 pix += line_size; \
692 pix1 += line_size; \
693 p += INCR; \
694 } while(--h); \
695} \
696 \
697void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
698 OPNAME ## _pixels, \
699 OPNAME ## _pixels_x2, \
700 OPNAME ## _pixels_y2, \
701 OPNAME ## _pixels_xy2, \
702};
703
de6d9b64
FB
704/* rounding primitives */
705#define avg2(a,b) ((a+b+1)>>1)
706#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
707
de6d9b64
FB
708#define op_avg(a, b) a = avg2(a, b)
709#define op_sub(a, b) a -= b
710
de6d9b64
FB
711PIXOP(DCTELEM, sub, op_sub, 8)
712
713/* not rounding primitives */
714#undef avg2
715#undef avg4
716#define avg2(a,b) ((a+b)>>1)
717#define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
718
de6d9b64
FB
719/* motion estimation */
720
721#undef avg2
722#undef avg4
57060b1e
FB
723#endif
724
de6d9b64
FB
/*
 * Rounding averages of two and of four values.
 * Every argument is parenthesized so that operands containing operators of
 * lower precedence than '+' (e.g. avg2(x|y, z)) are evaluated as intended.
 * Arguments are still expanded textually, so avoid side effects in them.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
727
44eb4951
MN
728static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
729{
730 const int A=(16-x16)*(16-y16);
731 const int B=( x16)*(16-y16);
732 const int C=(16-x16)*( y16);
733 const int D=( x16)*( y16);
734 int i;
735 rounder= 128 - rounder;
736
737 for(i=0; i<h; i++)
738 {
739 dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
740 dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
741 dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
742 dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
743 dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
744 dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
745 dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
746 dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
747 dst+= srcStride;
748 src+= srcStride;
749 }
750}
751
752static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
753{
754 UINT8 *cm = cropTbl + MAX_NEG_CROP;
755 int i;
756 for(i=0; i<h; i++)
757 {
ba6802de
MN
758 dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
759 dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
760 dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
761 dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
762 dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
763 dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
764 dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
765 dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
44eb4951
MN
766 dst+=dstStride;
767 src+=srcStride;
768 }
769}
770
771static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
772{
773 UINT8 *cm = cropTbl + MAX_NEG_CROP;
774 int i;
775 for(i=0; i<w; i++)
776 {
777 const int src0= src[0*srcStride];
778 const int src1= src[1*srcStride];
779 const int src2= src[2*srcStride];
780 const int src3= src[3*srcStride];
781 const int src4= src[4*srcStride];
782 const int src5= src[5*srcStride];
783 const int src6= src[6*srcStride];
784 const int src7= src[7*srcStride];
785 const int src8= src[8*srcStride];
ba6802de
MN
786 dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
787 dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
788 dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
789 dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
790 dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
791 dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
792 dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
793 dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
44eb4951
MN
794 dst++;
795 src++;
796 }
797}
798
799static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
800{
801 int i;
802 for(i=0; i<8; i++)
803 {
804 dst[0]= src[0];
805 dst[1]= src[1];
806 dst[2]= src[2];
807 dst[3]= src[3];
808 dst[4]= src[4];
809 dst[5]= src[5];
810 dst[6]= src[6];
811 dst[7]= src[7];
812 dst+=dstStride;
813 src+=srcStride;
814 }
815}
816
817static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
818{
819 int i;
820 for(i=0; i<8; i++)
821 {
822 dst[0]= (src1[0] + src2[0] + r)>>1;
823 dst[1]= (src1[1] + src2[1] + r)>>1;
824 dst[2]= (src1[2] + src2[2] + r)>>1;
825 dst[3]= (src1[3] + src2[3] + r)>>1;
826 dst[4]= (src1[4] + src2[4] + r)>>1;
827 dst[5]= (src1[5] + src2[5] + r)>>1;
828 dst[6]= (src1[6] + src2[6] + r)>>1;
829 dst[7]= (src1[7] + src2[7] + r)>>1;
830 dst+=dstStride;
831 src1+=srcStride;
832 src2+=8;
833 }
834}
835
836static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
837{
838 int i;
839 for(i=0; i<8; i++)
840 {
841 dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
842 dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
843 dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
844 dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
845 dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
846 dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
847 dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
848 dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
849 dst+=dstStride;
850 src1+=srcStride;
851 src2+=8;
7ff037e9 852 src3+=8;
44eb4951
MN
853 src4+=8;
854 }
855}
856
857#define QPEL_MC(r, name) \
858static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
859{\
860 put_block(dst, src, dstStride, srcStride);\
861}\
862\
863static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
864{\
865 UINT8 half[64];\
ba6802de 866 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
867 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
868}\
869\
870static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
871{\
ba6802de 872 qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
44eb4951
MN
873}\
874\
875static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
876{\
877 UINT8 half[64];\
ba6802de 878 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
879 avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
880}\
881\
882static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
883{\
884 UINT8 half[64];\
ba6802de 885 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
886 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
887}\
888\
889static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
890{\
ba6802de 891 qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
44eb4951
MN
892}\
893\
894static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
895{\
896 UINT8 half[64];\
ba6802de 897 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
898 avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
899}\
900static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
901{\
902 UINT8 halfH[72];\
7ff037e9 903 UINT8 halfV[64];\
44eb4951 904 UINT8 halfHV[64];\
ba6802de
MN
905 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
906 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
907 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
908 avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
909}\
910static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
911{\
912 UINT8 halfH[72];\
7ff037e9 913 UINT8 halfV[64];\
44eb4951 914 UINT8 halfHV[64];\
ba6802de
MN
915 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
916 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
917 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
918 avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
919}\
920static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
921{\
922 UINT8 halfH[72];\
7ff037e9 923 UINT8 halfV[64];\
44eb4951 924 UINT8 halfHV[64];\
ba6802de
MN
925 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
926 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
927 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 928 avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
44eb4951
MN
929}\
930static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
931{\
932 UINT8 halfH[72];\
7ff037e9 933 UINT8 halfV[64];\
44eb4951 934 UINT8 halfHV[64];\
ba6802de
MN
935 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
936 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
937 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 938 avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
44eb4951
MN
939}\
940static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
941{\
942 UINT8 halfH[72];\
943 UINT8 halfHV[64];\
ba6802de
MN
944 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
945 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
946 avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
947}\
948static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
949{\
950 UINT8 halfH[72];\
951 UINT8 halfHV[64];\
ba6802de
MN
952 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
953 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
954 avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
955}\
956static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
957{\
958 UINT8 halfH[72];\
7ff037e9 959 UINT8 halfV[64];\
44eb4951 960 UINT8 halfHV[64];\
ba6802de
MN
961 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
962 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
963 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 964 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
44eb4951
MN
965}\
966static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
967{\
968 UINT8 halfH[72];\
7ff037e9 969 UINT8 halfV[64];\
44eb4951 970 UINT8 halfHV[64];\
ba6802de
MN
971 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
972 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
973 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 974 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
44eb4951
MN
975}\
976static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
977{\
978 UINT8 halfH[72];\
ba6802de
MN
979 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
980 qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
44eb4951
MN
981}\
982qpel_mc_func qpel_mc ## name ## _tab[16]={ \
983 qpel_mc00_c ## name, \
984 qpel_mc10_c ## name, \
985 qpel_mc20_c ## name, \
986 qpel_mc30_c ## name, \
987 qpel_mc01_c ## name, \
988 qpel_mc11_c ## name, \
989 qpel_mc21_c ## name, \
990 qpel_mc31_c ## name, \
991 qpel_mc02_c ## name, \
992 qpel_mc12_c ## name, \
993 qpel_mc22_c ## name, \
994 qpel_mc32_c ## name, \
995 qpel_mc03_c ## name, \
996 qpel_mc13_c ## name, \
997 qpel_mc23_c ## name, \
998 qpel_mc33_c ## name, \
999};
1000
/* Expand QPEL_MC twice: once with r=0 and the _rnd suffix, once with r=1 and
   the _no_rnd suffix.  Each expansion defines the static qpel_mcXY_c<suffix>
   functions and the 16-entry qpel_mc<suffix>_tab table that lists them. */
QPEL_MC(0, _rnd)
QPEL_MC(1, _no_rnd)
1003
ba6802de 1004int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1005{
1006 int s, i;
1007
1008 s = 0;
ba6802de 1009 for(i=0;i<16;i++) {
de6d9b64
FB
1010 s += abs(pix1[0] - pix2[0]);
1011 s += abs(pix1[1] - pix2[1]);
1012 s += abs(pix1[2] - pix2[2]);
1013 s += abs(pix1[3] - pix2[3]);
1014 s += abs(pix1[4] - pix2[4]);
1015 s += abs(pix1[5] - pix2[5]);
1016 s += abs(pix1[6] - pix2[6]);
1017 s += abs(pix1[7] - pix2[7]);
1018 s += abs(pix1[8] - pix2[8]);
1019 s += abs(pix1[9] - pix2[9]);
1020 s += abs(pix1[10] - pix2[10]);
1021 s += abs(pix1[11] - pix2[11]);
1022 s += abs(pix1[12] - pix2[12]);
1023 s += abs(pix1[13] - pix2[13]);
1024 s += abs(pix1[14] - pix2[14]);
1025 s += abs(pix1[15] - pix2[15]);
1026 pix1 += line_size;
1027 pix2 += line_size;
1028 }
1029 return s;
1030}
1031
ba6802de 1032int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1033{
1034 int s, i;
1035
1036 s = 0;
ba6802de 1037 for(i=0;i<16;i++) {
de6d9b64
FB
1038 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1039 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1040 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1041 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1042 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1043 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1044 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1045 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1046 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1047 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1048 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1049 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1050 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1051 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1052 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1053 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1054 pix1 += line_size;
1055 pix2 += line_size;
1056 }
1057 return s;
1058}
1059
ba6802de 1060int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1061{
1062 int s, i;
1063 UINT8 *pix3 = pix2 + line_size;
1064
1065 s = 0;
ba6802de 1066 for(i=0;i<16;i++) {
de6d9b64
FB
1067 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1068 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1069 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1070 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1071 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1072 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1073 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1074 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1075 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1076 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1077 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1078 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1079 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1080 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1081 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1082 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1083 pix1 += line_size;
1084 pix2 += line_size;
1085 pix3 += line_size;
1086 }
1087 return s;
1088}
1089
ba6802de 1090int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1091{
1092 int s, i;
1093 UINT8 *pix3 = pix2 + line_size;
1094
1095 s = 0;
ba6802de 1096 for(i=0;i<16;i++) {
de6d9b64
FB
1097 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1098 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1099 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1100 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1101 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1102 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1103 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1104 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1105 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1106 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1107 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1108 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1109 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1110 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1111 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1112 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1113 pix1 += line_size;
1114 pix2 += line_size;
1115 pix3 += line_size;
1116 }
1117 return s;
1118}
1119
ba6802de
MN
1120int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1121{
1122 int s, i;
1123
1124 s = 0;
1125 for(i=0;i<8;i++) {
1126 s += abs(pix1[0] - pix2[0]);
1127 s += abs(pix1[1] - pix2[1]);
1128 s += abs(pix1[2] - pix2[2]);
1129 s += abs(pix1[3] - pix2[3]);
1130 s += abs(pix1[4] - pix2[4]);
1131 s += abs(pix1[5] - pix2[5]);
1132 s += abs(pix1[6] - pix2[6]);
1133 s += abs(pix1[7] - pix2[7]);
1134 pix1 += line_size;
1135 pix2 += line_size;
1136 }
1137 return s;
1138}
1139
1140int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1141{
1142 int s, i;
1143
1144 s = 0;
1145 for(i=0;i<8;i++) {
1146 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1147 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1148 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1149 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1150 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1151 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1152 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1153 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1154 pix1 += line_size;
1155 pix2 += line_size;
1156 }
1157 return s;
1158}
1159
1160int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1161{
1162 int s, i;
1163 UINT8 *pix3 = pix2 + line_size;
1164
1165 s = 0;
1166 for(i=0;i<8;i++) {
1167 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1168 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1169 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1170 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1171 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1172 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1173 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1174 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1175 pix1 += line_size;
1176 pix2 += line_size;
1177 pix3 += line_size;
1178 }
1179 return s;
1180}
1181
1182int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1183{
1184 int s, i;
1185 UINT8 *pix3 = pix2 + line_size;
1186
1187 s = 0;
1188 for(i=0;i<8;i++) {
1189 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1190 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1191 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1192 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1193 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1194 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1195 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1196 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1197 pix1 += line_size;
1198 pix2 += line_size;
1199 pix3 += line_size;
1200 }
1201 return s;
1202}
1203
e0eac44e
FB
1204/* permute block according so that it corresponds to the MMX idct
1205 order */
d962f6fd 1206#ifdef SIMPLE_IDCT
5a240838 1207 /* general permutation, but perhaps slightly slower */
d962f6fd
A
1208void block_permute(INT16 *block)
1209{
1210 int i;
1211 INT16 temp[64];
1212
d962f6fd
A
1213 for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1214
1215 for(i=0; i<64; i++) block[i] = temp[i];
d962f6fd 1216}
d962f6fd
A
1217#else
1218
e0eac44e 1219void block_permute(INT16 *block)
de6d9b64 1220{
e0eac44e 1221 int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
de6d9b64
FB
1222 int i;
1223
e0eac44e
FB
1224 for(i=0;i<8;i++) {
1225 tmp1 = block[1];
1226 tmp2 = block[2];
1227 tmp3 = block[3];
1228 tmp4 = block[4];
1229 tmp5 = block[5];
1230 tmp6 = block[6];
1231 block[1] = tmp2;
1232 block[2] = tmp4;
1233 block[3] = tmp6;
1234 block[4] = tmp1;
1235 block[5] = tmp3;
1236 block[6] = tmp5;
1237 block += 8;
1238 }
1239}
d962f6fd 1240#endif
e0eac44e 1241
649c00c9
MN
1242void clear_blocks_c(DCTELEM *blocks)
1243{
1244 memset(blocks, 0, sizeof(DCTELEM)*6*64);
1245}
1246
e0eac44e
FB
1247void dsputil_init(void)
1248{
1249 int i, j;
c34270f5 1250 int use_permuted_idct;
e0eac44e 1251
de6d9b64
FB
1252 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1253 for(i=0;i<MAX_NEG_CROP;i++) {
1254 cropTbl[i] = 0;
1255 cropTbl[i + MAX_NEG_CROP + 256] = 255;
1256 }
1257
1258 for(i=0;i<512;i++) {
1259 squareTbl[i] = (i - 256) * (i - 256);
1260 }
1261
d962f6fd
A
1262#ifdef SIMPLE_IDCT
1263 ff_idct = simple_idct;
1264#else
4af7bcc1 1265 ff_idct = j_rev_dct;
d962f6fd 1266#endif
de6d9b64 1267 get_pixels = get_pixels_c;
9dbcbd92 1268 diff_pixels = diff_pixels_c;
de6d9b64
FB
1269 put_pixels_clamped = put_pixels_clamped_c;
1270 add_pixels_clamped = add_pixels_clamped_c;
44eb4951 1271 gmc1= gmc1_c;
649c00c9 1272 clear_blocks= clear_blocks_c;
de6d9b64 1273
ba6802de
MN
1274 pix_abs16x16 = pix_abs16x16_c;
1275 pix_abs16x16_x2 = pix_abs16x16_x2_c;
1276 pix_abs16x16_y2 = pix_abs16x16_y2_c;
de6d9b64 1277 pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
ba6802de
MN
1278 pix_abs8x8 = pix_abs8x8_c;
1279 pix_abs8x8_x2 = pix_abs8x8_x2_c;
1280 pix_abs8x8_y2 = pix_abs8x8_y2_c;
1281 pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
03c94ede 1282 av_fdct = fdct_ifast;
de6d9b64 1283
c34270f5 1284 use_permuted_idct = 1;
e0eac44e 1285
980fc7b8 1286#ifdef HAVE_MMX
de6d9b64
FB
1287 dsputil_init_mmx();
1288#endif
3d03c0a2
FB
1289#ifdef ARCH_ARMV4L
1290 dsputil_init_armv4l();
1291#endif
c34270f5
FB
1292#ifdef HAVE_MLIB
1293 dsputil_init_mlib();
1294 use_permuted_idct = 0;
1295#endif
1e98dffb
NK
1296#ifdef ARCH_ALPHA
1297 dsputil_init_alpha();
1298 use_permuted_idct = 0;
1299#endif
c34270f5 1300
d962f6fd
A
1301#ifdef SIMPLE_IDCT
1302 if(ff_idct == simple_idct) use_permuted_idct=0;
1303#endif
1304
5a240838
MN
1305 if(use_permuted_idct)
1306#ifdef SIMPLE_IDCT
1307 for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
1308#else
1309 for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
1310#endif
1311 else
1312 for(i=0; i<64; i++) permutation[i]=i;
1313
2f349de2
MN
1314 for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
1315 for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
1316
c34270f5
FB
1317 if (use_permuted_idct) {
1318 /* permute for IDCT */
1319 for(i=0;i<64;i++) {
1320 j = zigzag_direct[i];
1321 zigzag_direct[i] = block_permute_op(j);
1322 j = ff_alternate_horizontal_scan[i];
1323 ff_alternate_horizontal_scan[i] = block_permute_op(j);
1324 j = ff_alternate_vertical_scan[i];
1325 ff_alternate_vertical_scan[i] = block_permute_op(j);
1326 }
1327 block_permute(default_intra_matrix);
1328 block_permute(default_non_intra_matrix);
3bf43d42
MN
1329 block_permute(ff_mpeg4_default_intra_matrix);
1330 block_permute(ff_mpeg4_default_non_intra_matrix);
c34270f5 1331 }
badaf88e
MN
1332
1333 build_zigzag_end();
de6d9b64 1334}
43f1708f 1335
57060b1e
FB
/* Testing aid: disable every operation that is not bit exact.  Only the MMX
   code is asked to do so; without HAVE_MMX this function does nothing. */
void avcodec_set_bit_exact(void)
{
#if defined(HAVE_MMX)
    dsputil_set_bit_exact_mmx();
#endif
}
1343
43f1708f
J
1344void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1345 int orig_linesize[3], int coded_linesize,
1346 AVCodecContext *avctx)
1347{
1348 int quad, diff, x, y;
1349 UINT8 *orig, *coded;
1350 UINT32 *sq = squareTbl + 256;
1351
1352 quad = 0;
1353 diff = 0;
1354
1355 /* Luminance */
1356 orig = orig_image[0];
1357 coded = coded_image[0];
1358
1359 for (y=0;y<avctx->height;y++) {
1360 for (x=0;x<avctx->width;x++) {
1361 diff = *(orig + x) - *(coded + x);
1362 quad += sq[diff];
1363 }
1364 orig += orig_linesize[0];
1365 coded += coded_linesize;
1366 }
1367
1368 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1369
1370 if (avctx->psnr_y) {
1371 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1372 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1373 } else
1374 avctx->psnr_y = 99.99;
1375}
1376