error concealment needs the mbintra_table so it should always be allocated
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
1/*
2 * DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
de6d9b64 4 *
ff4ec49e
FB
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
de6d9b64 9 *
ff4ec49e 10 * This library is distributed in the hope that it will be useful,
de6d9b64 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
de6d9b64 14 *
ff4ec49e
FB
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
7ff037e9 18 *
59fe111e 19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
de6d9b64 20 */
de6d9b64
FB
21#include "avcodec.h"
22#include "dsputil.h"
d962f6fd 23#include "simple_idct.h"
de6d9b64 24
4af7bcc1 25void (*ff_idct)(DCTELEM *block);
8ee14970
FB
26void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block);
27void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block);
03c94ede 28void (*av_fdct)(DCTELEM *block);
de6d9b64 29void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
9dbcbd92 30void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
de6d9b64
FB
31void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
32void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
44eb4951 33void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
649c00c9 34void (*clear_blocks)(DCTELEM *blocks);
de6d9b64
FB
35
36op_pixels_abs_func pix_abs16x16;
37op_pixels_abs_func pix_abs16x16_x2;
38op_pixels_abs_func pix_abs16x16_y2;
39op_pixels_abs_func pix_abs16x16_xy2;
40
ba6802de
MN
41op_pixels_abs_func pix_abs8x8;
42op_pixels_abs_func pix_abs8x8_x2;
43op_pixels_abs_func pix_abs8x8_y2;
44op_pixels_abs_func pix_abs8x8_xy2;
45
0cfa9713 46UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
de6d9b64
FB
47UINT32 squareTbl[512];
48
adc09b2e
MK
49extern INT16 ff_mpeg1_default_intra_matrix[64];
50extern INT16 ff_mpeg1_default_non_intra_matrix[64];
f0ca2e1b
ZK
51extern INT16 ff_mpeg4_default_intra_matrix[64];
52extern INT16 ff_mpeg4_default_non_intra_matrix[64];
e0eac44e
FB
53
54UINT8 zigzag_direct[64] = {
55 0, 1, 8, 16, 9, 2, 3, 10,
56 17, 24, 32, 25, 18, 11, 4, 5,
57 12, 19, 26, 33, 40, 48, 41, 34,
58 27, 20, 13, 6, 7, 14, 21, 28,
59 35, 42, 49, 56, 57, 50, 43, 36,
60 29, 22, 15, 23, 30, 37, 44, 51,
61 58, 59, 52, 45, 38, 31, 39, 46,
62 53, 60, 61, 54, 47, 55, 62, 63
63};
64
2f349de2
MN
65/* not permuted inverse zigzag_direct + 1 for MMX quantizer */
66UINT16 __align8 inv_zigzag_direct16[64];
67
68/* not permuted zigzag_direct for MMX quantizer */
69UINT8 zigzag_direct_noperm[64];
70
e0eac44e
FB
71UINT8 ff_alternate_horizontal_scan[64] = {
72 0, 1, 2, 3, 8, 9, 16, 17,
73 10, 11, 4, 5, 6, 7, 15, 14,
74 13, 12, 19, 18, 24, 25, 32, 33,
75 26, 27, 20, 21, 22, 23, 28, 29,
76 30, 31, 34, 35, 40, 41, 48, 49,
77 42, 43, 36, 37, 38, 39, 44, 45,
78 46, 47, 50, 51, 56, 57, 58, 59,
79 52, 53, 54, 55, 60, 61, 62, 63,
80};
81
82UINT8 ff_alternate_vertical_scan[64] = {
83 0, 8, 16, 24, 1, 9, 2, 10,
84 17, 25, 32, 40, 48, 56, 57, 49,
85 41, 33, 26, 18, 3, 11, 4, 12,
86 19, 27, 34, 42, 50, 58, 35, 43,
87 51, 59, 20, 28, 5, 13, 6, 14,
88 21, 29, 36, 44, 52, 60, 37, 45,
89 53, 61, 22, 30, 7, 15, 23, 31,
90 38, 46, 54, 62, 39, 47, 55, 63,
91};
92
e4986da9
J
93#ifdef SIMPLE_IDCT
94
0a8d8945 95/* Input permutation for the simple_idct_mmx */
5a240838 96static UINT8 simple_mmx_permutation[64]={
0a8d8945
MN
97 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
98 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
99 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
100 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
101 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
102 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
103 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
104 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
5a240838 105};
e4986da9 106#endif
5a240838 107
2f349de2
MN
108/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
109UINT32 inverse[256]={
110 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
111 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
112 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
113 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
114 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
115 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
116 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
117 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
118 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
119 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
120 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
121 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
122 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
123 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
124 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
125 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
126 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
127 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
128 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
129 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
130 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
131 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
132 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
133 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
134 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
135 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
136 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
137 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
138 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
139 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
140 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
141 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
142};
143
badaf88e
MN
144/* used to skip zeros at the end */
145UINT8 zigzag_end[64];
146
5a240838
MN
147UINT8 permutation[64];
148//UINT8 invPermutation[64];
149
20695ec9 150static void build_zigzag_end(void)
badaf88e
MN
151{
152 int lastIndex;
153 int lastIndexAfterPerm=0;
154 for(lastIndex=0; lastIndex<64; lastIndex++)
155 {
156 if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
157 lastIndexAfterPerm= zigzag_direct[lastIndex];
158 zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
159 }
160}
161
c13e1abd 162void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
de6d9b64 163{
de6d9b64
FB
164 int i;
165
166 /* read the pixels */
de6d9b64 167 for(i=0;i<8;i++) {
c13e1abd
FH
168 block[0] = pixels[0];
169 block[1] = pixels[1];
170 block[2] = pixels[2];
171 block[3] = pixels[3];
172 block[4] = pixels[4];
173 block[5] = pixels[5];
174 block[6] = pixels[6];
175 block[7] = pixels[7];
176 pixels += line_size;
177 block += 8;
de6d9b64
FB
178 }
179}
180
c13e1abd
FH
181void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1, const UINT8 *s2,
182 int stride){
9dbcbd92
MN
183 int i;
184
185 /* read the pixels */
9dbcbd92 186 for(i=0;i<8;i++) {
c13e1abd
FH
187 block[0] = s1[0] - s2[0];
188 block[1] = s1[1] - s2[1];
189 block[2] = s1[2] - s2[2];
190 block[3] = s1[3] - s2[3];
191 block[4] = s1[4] - s2[4];
192 block[5] = s1[5] - s2[5];
193 block[6] = s1[6] - s2[6];
194 block[7] = s1[7] - s2[7];
9dbcbd92
MN
195 s1 += stride;
196 s2 += stride;
c13e1abd 197 block += 8;
9dbcbd92
MN
198 }
199}
200
201
c13e1abd
FH
202void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
203 int line_size)
de6d9b64 204{
de6d9b64
FB
205 int i;
206 UINT8 *cm = cropTbl + MAX_NEG_CROP;
207
208 /* read the pixels */
de6d9b64 209 for(i=0;i<8;i++) {
c13e1abd
FH
210 pixels[0] = cm[block[0]];
211 pixels[1] = cm[block[1]];
212 pixels[2] = cm[block[2]];
213 pixels[3] = cm[block[3]];
214 pixels[4] = cm[block[4]];
215 pixels[5] = cm[block[5]];
216 pixels[6] = cm[block[6]];
217 pixels[7] = cm[block[7]];
218
219 pixels += line_size;
220 block += 8;
de6d9b64
FB
221 }
222}
223
c13e1abd
FH
224void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
225 int line_size)
de6d9b64 226{
de6d9b64
FB
227 int i;
228 UINT8 *cm = cropTbl + MAX_NEG_CROP;
229
230 /* read the pixels */
de6d9b64 231 for(i=0;i<8;i++) {
c13e1abd
FH
232 pixels[0] = cm[pixels[0] + block[0]];
233 pixels[1] = cm[pixels[1] + block[1]];
234 pixels[2] = cm[pixels[2] + block[2]];
235 pixels[3] = cm[pixels[3] + block[3]];
236 pixels[4] = cm[pixels[4] + block[4]];
237 pixels[5] = cm[pixels[5] + block[5]];
238 pixels[6] = cm[pixels[6] + block[6]];
239 pixels[7] = cm[pixels[7] + block[7]];
240 pixels += line_size;
241 block += 8;
de6d9b64
FB
242 }
243}
244
59fe111e
MN
245#if 0
246
247#define PIXOP2(OPNAME, OP) \
248void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
249{\
250 int i;\
251 for(i=0; i<h; i++){\
252 OP(*((uint64_t*)block), LD64(pixels));\
253 pixels+=line_size;\
254 block +=line_size;\
255 }\
256}\
257\
258void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
259{\
260 int i;\
261 for(i=0; i<h; i++){\
262 const uint64_t a= LD64(pixels );\
263 const uint64_t b= LD64(pixels+1);\
264 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
265 pixels+=line_size;\
266 block +=line_size;\
267 }\
268}\
269\
270void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
271{\
272 int i;\
273 for(i=0; i<h; i++){\
274 const uint64_t a= LD64(pixels );\
275 const uint64_t b= LD64(pixels+1);\
276 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
277 pixels+=line_size;\
278 block +=line_size;\
279 }\
280}\
281\
282void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
283{\
284 int i;\
285 for(i=0; i<h; i++){\
286 const uint64_t a= LD64(pixels );\
287 const uint64_t b= LD64(pixels+line_size);\
288 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
289 pixels+=line_size;\
290 block +=line_size;\
291 }\
292}\
293\
294void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
295{\
296 int i;\
297 for(i=0; i<h; i++){\
298 const uint64_t a= LD64(pixels );\
299 const uint64_t b= LD64(pixels+line_size);\
300 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
301 pixels+=line_size;\
302 block +=line_size;\
303 }\
304}\
305\
306void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
307{\
308 int i;\
309 const uint64_t a= LD64(pixels );\
310 const uint64_t b= LD64(pixels+1);\
311 uint64_t l0= (a&0x0303030303030303ULL)\
312 + (b&0x0303030303030303ULL)\
313 + 0x0202020202020202ULL;\
314 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
315 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
316 uint64_t l1,h1;\
317\
318 pixels+=line_size;\
319 for(i=0; i<h; i+=2){\
320 uint64_t a= LD64(pixels );\
321 uint64_t b= LD64(pixels+1);\
322 l1= (a&0x0303030303030303ULL)\
323 + (b&0x0303030303030303ULL);\
324 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
325 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
326 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
327 pixels+=line_size;\
328 block +=line_size;\
329 a= LD64(pixels );\
330 b= LD64(pixels+1);\
331 l0= (a&0x0303030303030303ULL)\
332 + (b&0x0303030303030303ULL)\
333 + 0x0202020202020202ULL;\
334 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
335 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
336 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
337 pixels+=line_size;\
338 block +=line_size;\
339 }\
340}\
341\
342void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
343{\
344 int i;\
345 const uint64_t a= LD64(pixels );\
346 const uint64_t b= LD64(pixels+1);\
347 uint64_t l0= (a&0x0303030303030303ULL)\
348 + (b&0x0303030303030303ULL)\
349 + 0x0101010101010101ULL;\
350 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
351 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
352 uint64_t l1,h1;\
353\
354 pixels+=line_size;\
355 for(i=0; i<h; i+=2){\
356 uint64_t a= LD64(pixels );\
357 uint64_t b= LD64(pixels+1);\
358 l1= (a&0x0303030303030303ULL)\
359 + (b&0x0303030303030303ULL);\
360 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
361 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
362 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
363 pixels+=line_size;\
364 block +=line_size;\
365 a= LD64(pixels );\
366 b= LD64(pixels+1);\
367 l0= (a&0x0303030303030303ULL)\
368 + (b&0x0303030303030303ULL)\
369 + 0x0101010101010101ULL;\
370 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
371 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
372 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
373 pixels+=line_size;\
374 block +=line_size;\
375 }\
376}\
377\
378void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
379 OPNAME ## _pixels,\
380 OPNAME ## _pixels_x2,\
381 OPNAME ## _pixels_y2,\
382 OPNAME ## _pixels_xy2,\
383};\
384\
385void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
386 OPNAME ## _pixels,\
387 OPNAME ## _no_rnd_pixels_x2,\
388 OPNAME ## _no_rnd_pixels_y2,\
389 OPNAME ## _no_rnd_pixels_xy2,\
390};
391
392#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
393#else // 32 bit variant (the disabled branch above is the 64 bit one)
394
/*
 * PIXOP2(OPNAME, OP) -- 32 bit variant.
 *
 * Expands to seven helpers that work on an area 8 pixels wide and h rows
 * high, handling four pixels (one 32 bit word) per operation, plus two
 * dispatch tables:
 *
 *   OPNAME_pixels              source word passed to OP unchanged
 *   OPNAME_pixels_x2           average of each pixel and its right neighbour
 *   OPNAME_pixels_y2           average of each pixel and the one a line below
 *   OPNAME_pixels_xy2          average of the 2x2 neighbourhood
 *   OPNAME_no_rnd_pixels_*     same as above, different rounding
 *
 *   OPNAME_pixels_tab[4]        { plain, x2, y2, xy2 }
 *   OPNAME_no_rnd_pixels_tab[4] { plain, no_rnd x2, no_rnd y2, no_rnd xy2 }
 *
 * OP(dst, value) is applied to every destination word; block and pixels
 * both advance by line_size per row.
 *
 * Byte-parallel arithmetic used below (per byte, no carry between bytes):
 *   (a & b) + (((a ^ b) & 0xFE..) >> 1)   ==  (a + b)     >> 1
 *   (a | b) - (((a ^ b) & 0xFE..) >> 1)   ==  (a + b + 1) >> 1
 * The xy2 helpers split every byte into its low 2 bits and high 6 bits,
 * sum the four neighbours separately and recombine; the bias folded into
 * the low part is 2 for the rounding and 1 for the no_rnd flavour.
 *
 * The xy2 helpers emit rows in pairs, so h is expected to be even.
 */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int y;\
    for (y = 0; y < h; y++) {\
        int x;\
        for (x = 0; x < 8; x += 4) {\
            OP(*((uint32_t*)(block + x)), LD32(pixels + x));\
        }\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int y;\
    for (y = 0; y < h; y++) {\
        int x;\
        for (x = 0; x < 8; x += 4) {\
            const uint32_t left  = LD32(pixels + x);\
            const uint32_t right = LD32(pixels + x + 1);\
            OP(*((uint32_t*)(block + x)), (left&right) + (((left^right)&0xFEFEFEFEUL)>>1));\
        }\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int y;\
    for (y = 0; y < h; y++) {\
        int x;\
        for (x = 0; x < 8; x += 4) {\
            const uint32_t left  = LD32(pixels + x);\
            const uint32_t right = LD32(pixels + x + 1);\
            OP(*((uint32_t*)(block + x)), (left|right) - (((left^right)&0xFEFEFEFEUL)>>1));\
        }\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int y;\
    for (y = 0; y < h; y++) {\
        int x;\
        for (x = 0; x < 8; x += 4) {\
            const uint32_t upper = LD32(pixels + x);\
            const uint32_t lower = LD32(pixels + x + line_size);\
            OP(*((uint32_t*)(block + x)), (upper&lower) + (((upper^lower)&0xFEFEFEFEUL)>>1));\
        }\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int y;\
    for (y = 0; y < h; y++) {\
        int x;\
        for (x = 0; x < 8; x += 4) {\
            const uint32_t upper = LD32(pixels + x);\
            const uint32_t lower = LD32(pixels + x + line_size);\
            OP(*((uint32_t*)(block + x)), (upper|lower) - (((upper^lower)&0xFEFEFEFEUL)>>1));\
        }\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int half;\
    for (half = 0; half < 2; half++) {\
        int y;\
        uint32_t p = LD32(pixels);\
        uint32_t q = LD32(pixels + 1);\
        uint32_t lo_a = (p&0x03030303UL)\
                      + (q&0x03030303UL)\
                      + 0x02020202UL;\
        uint32_t hi_a = ((p&0xFCFCFCFCUL)>>2)\
                      + ((q&0xFCFCFCFCUL)>>2);\
        uint32_t lo_b, hi_b;\
\
        pixels += line_size;\
        for (y = 0; y < h; y += 2) {\
            p = LD32(pixels);\
            q = LD32(pixels + 1);\
            lo_b = (p&0x03030303UL)\
                 + (q&0x03030303UL);\
            hi_b = ((p&0xFCFCFCFCUL)>>2)\
                 + ((q&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), hi_a+hi_b+(((lo_a+lo_b)>>2)&0x0F0F0F0FUL));\
            pixels += line_size;\
            block  += line_size;\
            p = LD32(pixels);\
            q = LD32(pixels + 1);\
            lo_a = (p&0x03030303UL)\
                 + (q&0x03030303UL)\
                 + 0x02020202UL;\
            hi_a = ((p&0xFCFCFCFCUL)>>2)\
                 + ((q&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), hi_a+hi_b+(((lo_a+lo_b)>>2)&0x0F0F0F0FUL));\
            pixels += line_size;\
            block  += line_size;\
        }\
        pixels += 4 - line_size*(h+1);\
        block  += 4 - line_size*h;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int half;\
    for (half = 0; half < 2; half++) {\
        int y;\
        uint32_t p = LD32(pixels);\
        uint32_t q = LD32(pixels + 1);\
        uint32_t lo_a = (p&0x03030303UL)\
                      + (q&0x03030303UL)\
                      + 0x01010101UL;\
        uint32_t hi_a = ((p&0xFCFCFCFCUL)>>2)\
                      + ((q&0xFCFCFCFCUL)>>2);\
        uint32_t lo_b, hi_b;\
\
        pixels += line_size;\
        for (y = 0; y < h; y += 2) {\
            p = LD32(pixels);\
            q = LD32(pixels + 1);\
            lo_b = (p&0x03030303UL)\
                 + (q&0x03030303UL);\
            hi_b = ((p&0xFCFCFCFCUL)>>2)\
                 + ((q&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), hi_a+hi_b+(((lo_a+lo_b)>>2)&0x0F0F0F0FUL));\
            pixels += line_size;\
            block  += line_size;\
            p = LD32(pixels);\
            q = LD32(pixels + 1);\
            lo_a = (p&0x03030303UL)\
                 + (q&0x03030303UL)\
                 + 0x01010101UL;\
            hi_a = ((p&0xFCFCFCFCUL)>>2)\
                 + ((q&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), hi_a+hi_b+(((lo_a+lo_b)>>2)&0x0F0F0F0FUL));\
            pixels += line_size;\
            block  += line_size;\
        }\
        pixels += 4 - line_size*(h+1);\
        block  += 4 - line_size*h;\
    }\
}\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};
570#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
571#endif
572
573#define op_put(a, b) a = b
574
575PIXOP2(avg, op_avg)
576PIXOP2(put, op_put)
577#undef op_avg
578#undef op_put
579
57060b1e 580#if 0
59fe111e 581/* FIXME this stuff could be removed as it is not really used anymore */
de6d9b64
FB
582#define PIXOP(BTYPE, OPNAME, OP, INCR) \
583 \
584static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
585{ \
586 BTYPE *p; \
587 const UINT8 *pix; \
588 \
589 p = block; \
590 pix = pixels; \
591 do { \
592 OP(p[0], pix[0]); \
593 OP(p[1], pix[1]); \
594 OP(p[2], pix[2]); \
595 OP(p[3], pix[3]); \
596 OP(p[4], pix[4]); \
597 OP(p[5], pix[5]); \
598 OP(p[6], pix[6]); \
599 OP(p[7], pix[7]); \
600 pix += line_size; \
601 p += INCR; \
602 } while (--h);; \
603} \
604 \
605static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
606{ \
607 BTYPE *p; \
608 const UINT8 *pix; \
609 \
610 p = block; \
611 pix = pixels; \
612 do { \
613 OP(p[0], avg2(pix[0], pix[1])); \
614 OP(p[1], avg2(pix[1], pix[2])); \
615 OP(p[2], avg2(pix[2], pix[3])); \
616 OP(p[3], avg2(pix[3], pix[4])); \
617 OP(p[4], avg2(pix[4], pix[5])); \
618 OP(p[5], avg2(pix[5], pix[6])); \
619 OP(p[6], avg2(pix[6], pix[7])); \
620 OP(p[7], avg2(pix[7], pix[8])); \
621 pix += line_size; \
622 p += INCR; \
623 } while (--h); \
624} \
625 \
626static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
627{ \
628 BTYPE *p; \
629 const UINT8 *pix; \
630 const UINT8 *pix1; \
631 \
632 p = block; \
633 pix = pixels; \
634 pix1 = pixels + line_size; \
635 do { \
636 OP(p[0], avg2(pix[0], pix1[0])); \
637 OP(p[1], avg2(pix[1], pix1[1])); \
638 OP(p[2], avg2(pix[2], pix1[2])); \
639 OP(p[3], avg2(pix[3], pix1[3])); \
640 OP(p[4], avg2(pix[4], pix1[4])); \
641 OP(p[5], avg2(pix[5], pix1[5])); \
642 OP(p[6], avg2(pix[6], pix1[6])); \
643 OP(p[7], avg2(pix[7], pix1[7])); \
644 pix += line_size; \
645 pix1 += line_size; \
646 p += INCR; \
647 } while(--h); \
648} \
649 \
650static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
651{ \
652 BTYPE *p; \
653 const UINT8 *pix; \
654 const UINT8 *pix1; \
655 \
656 p = block; \
657 pix = pixels; \
658 pix1 = pixels + line_size; \
659 do { \
660 OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1])); \
661 OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2])); \
662 OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3])); \
663 OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4])); \
664 OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5])); \
665 OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6])); \
666 OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7])); \
667 OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8])); \
668 pix += line_size; \
669 pix1 += line_size; \
670 p += INCR; \
671 } while(--h); \
672} \
673 \
674void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
675 OPNAME ## _pixels, \
676 OPNAME ## _pixels_x2, \
677 OPNAME ## _pixels_y2, \
678 OPNAME ## _pixels_xy2, \
679};
680
de6d9b64
FB
681/* rounding primitives */
682#define avg2(a,b) ((a+b+1)>>1)
683#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
684
de6d9b64
FB
685#define op_avg(a, b) a = avg2(a, b)
686#define op_sub(a, b) a -= b
687
de6d9b64
FB
688PIXOP(DCTELEM, sub, op_sub, 8)
689
690/* not rounding primitives */
691#undef avg2
692#undef avg4
693#define avg2(a,b) ((a+b)>>1)
694#define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
695
de6d9b64
FB
696/* motion estimation */
697
698#undef avg2
699#undef avg4
57060b1e
FB
700#endif
701
de6d9b64
FB
/*
 * Rounding averages of two and four values.
 * Every argument is parenthesised so that expressions with operators of
 * lower precedence than '+' (e.g. x|y or c ? a : b) are averaged as a whole.
 * Arguments are evaluated exactly once each.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
704
44eb4951
MN
705static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
706{
707 const int A=(16-x16)*(16-y16);
708 const int B=( x16)*(16-y16);
709 const int C=(16-x16)*( y16);
710 const int D=( x16)*( y16);
711 int i;
712 rounder= 128 - rounder;
713
714 for(i=0; i<h; i++)
715 {
716 dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
717 dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
718 dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
719 dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
720 dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
721 dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
722 dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
723 dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
724 dst+= srcStride;
725 src+= srcStride;
726 }
727}
728
729static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
730{
731 UINT8 *cm = cropTbl + MAX_NEG_CROP;
732 int i;
733 for(i=0; i<h; i++)
734 {
ba6802de
MN
735 dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
736 dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
737 dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
738 dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
739 dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
740 dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
741 dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
742 dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
44eb4951
MN
743 dst+=dstStride;
744 src+=srcStride;
745 }
746}
747
748static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
749{
750 UINT8 *cm = cropTbl + MAX_NEG_CROP;
751 int i;
752 for(i=0; i<w; i++)
753 {
754 const int src0= src[0*srcStride];
755 const int src1= src[1*srcStride];
756 const int src2= src[2*srcStride];
757 const int src3= src[3*srcStride];
758 const int src4= src[4*srcStride];
759 const int src5= src[5*srcStride];
760 const int src6= src[6*srcStride];
761 const int src7= src[7*srcStride];
762 const int src8= src[8*srcStride];
ba6802de
MN
763 dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
764 dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
765 dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
766 dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
767 dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
768 dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
769 dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
770 dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
44eb4951
MN
771 dst++;
772 src++;
773 }
774}
775
776static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
777{
778 int i;
779 for(i=0; i<8; i++)
780 {
781 dst[0]= src[0];
782 dst[1]= src[1];
783 dst[2]= src[2];
784 dst[3]= src[3];
785 dst[4]= src[4];
786 dst[5]= src[5];
787 dst[6]= src[6];
788 dst[7]= src[7];
789 dst+=dstStride;
790 src+=srcStride;
791 }
792}
793
794static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
795{
796 int i;
797 for(i=0; i<8; i++)
798 {
799 dst[0]= (src1[0] + src2[0] + r)>>1;
800 dst[1]= (src1[1] + src2[1] + r)>>1;
801 dst[2]= (src1[2] + src2[2] + r)>>1;
802 dst[3]= (src1[3] + src2[3] + r)>>1;
803 dst[4]= (src1[4] + src2[4] + r)>>1;
804 dst[5]= (src1[5] + src2[5] + r)>>1;
805 dst[6]= (src1[6] + src2[6] + r)>>1;
806 dst[7]= (src1[7] + src2[7] + r)>>1;
807 dst+=dstStride;
808 src1+=srcStride;
809 src2+=8;
810 }
811}
812
813static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
814{
815 int i;
816 for(i=0; i<8; i++)
817 {
818 dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
819 dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
820 dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
821 dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
822 dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
823 dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
824 dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
825 dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
826 dst+=dstStride;
827 src1+=srcStride;
828 src2+=8;
7ff037e9 829 src3+=8;
44eb4951
MN
830 src4+=8;
831 }
832}
833
834#define QPEL_MC(r, name) \
835static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
836{\
837 put_block(dst, src, dstStride, srcStride);\
838}\
839\
840static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
841{\
842 UINT8 half[64];\
ba6802de 843 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
844 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
845}\
846\
847static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
848{\
ba6802de 849 qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
44eb4951
MN
850}\
851\
852static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
853{\
854 UINT8 half[64];\
ba6802de 855 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
856 avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
857}\
858\
859static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
860{\
861 UINT8 half[64];\
ba6802de 862 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
863 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
864}\
865\
866static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
867{\
ba6802de 868 qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
44eb4951
MN
869}\
870\
871static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
872{\
873 UINT8 half[64];\
ba6802de 874 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
875 avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
876}\
877static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
878{\
879 UINT8 halfH[72];\
7ff037e9 880 UINT8 halfV[64];\
44eb4951 881 UINT8 halfHV[64];\
ba6802de
MN
882 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
883 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
884 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
885 avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
886}\
887static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
888{\
889 UINT8 halfH[72];\
7ff037e9 890 UINT8 halfV[64];\
44eb4951 891 UINT8 halfHV[64];\
ba6802de
MN
892 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
893 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
894 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
895 avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
896}\
897static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
898{\
899 UINT8 halfH[72];\
7ff037e9 900 UINT8 halfV[64];\
44eb4951 901 UINT8 halfHV[64];\
ba6802de
MN
902 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
903 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
904 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 905 avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
44eb4951
MN
906}\
907static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
908{\
909 UINT8 halfH[72];\
7ff037e9 910 UINT8 halfV[64];\
44eb4951 911 UINT8 halfHV[64];\
ba6802de
MN
912 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
913 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
914 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 915 avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
44eb4951
MN
916}\
917static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
918{\
919 UINT8 halfH[72];\
920 UINT8 halfHV[64];\
ba6802de
MN
921 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
922 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
923 avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
924}\
925static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
926{\
927 UINT8 halfH[72];\
928 UINT8 halfHV[64];\
ba6802de
MN
929 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
930 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
931 avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
932}\
933static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
934{\
935 UINT8 halfH[72];\
7ff037e9 936 UINT8 halfV[64];\
44eb4951 937 UINT8 halfHV[64];\
ba6802de
MN
938 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
939 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
940 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 941 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
44eb4951
MN
942}\
943static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
944{\
945 UINT8 halfH[72];\
7ff037e9 946 UINT8 halfV[64];\
44eb4951 947 UINT8 halfHV[64];\
ba6802de
MN
948 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
949 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
950 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 951 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
44eb4951
MN
952}\
953static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
954{\
955 UINT8 halfH[72];\
ba6802de
MN
956 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
957 qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
44eb4951
MN
958}\
959qpel_mc_func qpel_mc ## name ## _tab[16]={ \
960 qpel_mc00_c ## name, \
961 qpel_mc10_c ## name, \
962 qpel_mc20_c ## name, \
963 qpel_mc30_c ## name, \
964 qpel_mc01_c ## name, \
965 qpel_mc11_c ## name, \
966 qpel_mc21_c ## name, \
967 qpel_mc31_c ## name, \
968 qpel_mc02_c ## name, \
969 qpel_mc12_c ## name, \
970 qpel_mc22_c ## name, \
971 qpel_mc32_c ## name, \
972 qpel_mc03_c ## name, \
973 qpel_mc13_c ## name, \
974 qpel_mc23_c ## name, \
975 qpel_mc33_c ## name, \
976};
977
978QPEL_MC(0, _rnd)
979QPEL_MC(1, _no_rnd)
980
ba6802de 981int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
982{
983 int s, i;
984
985 s = 0;
ba6802de 986 for(i=0;i<16;i++) {
de6d9b64
FB
987 s += abs(pix1[0] - pix2[0]);
988 s += abs(pix1[1] - pix2[1]);
989 s += abs(pix1[2] - pix2[2]);
990 s += abs(pix1[3] - pix2[3]);
991 s += abs(pix1[4] - pix2[4]);
992 s += abs(pix1[5] - pix2[5]);
993 s += abs(pix1[6] - pix2[6]);
994 s += abs(pix1[7] - pix2[7]);
995 s += abs(pix1[8] - pix2[8]);
996 s += abs(pix1[9] - pix2[9]);
997 s += abs(pix1[10] - pix2[10]);
998 s += abs(pix1[11] - pix2[11]);
999 s += abs(pix1[12] - pix2[12]);
1000 s += abs(pix1[13] - pix2[13]);
1001 s += abs(pix1[14] - pix2[14]);
1002 s += abs(pix1[15] - pix2[15]);
1003 pix1 += line_size;
1004 pix2 += line_size;
1005 }
1006 return s;
1007}
1008
ba6802de 1009int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1010{
1011 int s, i;
1012
1013 s = 0;
ba6802de 1014 for(i=0;i<16;i++) {
de6d9b64
FB
1015 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1016 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1017 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1018 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1019 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1020 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1021 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1022 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1023 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1024 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1025 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1026 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1027 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1028 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1029 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1030 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1031 pix1 += line_size;
1032 pix2 += line_size;
1033 }
1034 return s;
1035}
1036
ba6802de 1037int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1038{
1039 int s, i;
1040 UINT8 *pix3 = pix2 + line_size;
1041
1042 s = 0;
ba6802de 1043 for(i=0;i<16;i++) {
de6d9b64
FB
1044 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1045 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1046 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1047 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1048 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1049 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1050 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1051 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1052 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1053 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1054 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1055 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1056 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1057 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1058 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1059 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1060 pix1 += line_size;
1061 pix2 += line_size;
1062 pix3 += line_size;
1063 }
1064 return s;
1065}
1066
ba6802de 1067int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1068{
1069 int s, i;
1070 UINT8 *pix3 = pix2 + line_size;
1071
1072 s = 0;
ba6802de 1073 for(i=0;i<16;i++) {
de6d9b64
FB
1074 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1075 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1076 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1077 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1078 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1079 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1080 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1081 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1082 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1083 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1084 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1085 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1086 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1087 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1088 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1089 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1090 pix1 += line_size;
1091 pix2 += line_size;
1092 pix3 += line_size;
1093 }
1094 return s;
1095}
1096
ba6802de
MN
1097int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1098{
1099 int s, i;
1100
1101 s = 0;
1102 for(i=0;i<8;i++) {
1103 s += abs(pix1[0] - pix2[0]);
1104 s += abs(pix1[1] - pix2[1]);
1105 s += abs(pix1[2] - pix2[2]);
1106 s += abs(pix1[3] - pix2[3]);
1107 s += abs(pix1[4] - pix2[4]);
1108 s += abs(pix1[5] - pix2[5]);
1109 s += abs(pix1[6] - pix2[6]);
1110 s += abs(pix1[7] - pix2[7]);
1111 pix1 += line_size;
1112 pix2 += line_size;
1113 }
1114 return s;
1115}
1116
1117int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1118{
1119 int s, i;
1120
1121 s = 0;
1122 for(i=0;i<8;i++) {
1123 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1124 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1125 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1126 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1127 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1128 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1129 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1130 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1131 pix1 += line_size;
1132 pix2 += line_size;
1133 }
1134 return s;
1135}
1136
1137int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1138{
1139 int s, i;
1140 UINT8 *pix3 = pix2 + line_size;
1141
1142 s = 0;
1143 for(i=0;i<8;i++) {
1144 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1145 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1146 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1147 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1148 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1149 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1150 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1151 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1152 pix1 += line_size;
1153 pix2 += line_size;
1154 pix3 += line_size;
1155 }
1156 return s;
1157}
1158
1159int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1160{
1161 int s, i;
1162 UINT8 *pix3 = pix2 + line_size;
1163
1164 s = 0;
1165 for(i=0;i<8;i++) {
1166 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1167 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1168 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1169 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1170 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1171 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1172 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1173 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1174 pix1 += line_size;
1175 pix2 += line_size;
1176 pix3 += line_size;
1177 }
1178 return s;
1179}
1180
e0eac44e
FB
1181/* permute block according so that it corresponds to the MMX idct
1182 order */
d962f6fd 1183#ifdef SIMPLE_IDCT
5a240838 1184 /* general permutation, but perhaps slightly slower */
d962f6fd
A
1185void block_permute(INT16 *block)
1186{
1187 int i;
1188 INT16 temp[64];
1189
d962f6fd
A
1190 for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1191
1192 for(i=0; i<64; i++) block[i] = temp[i];
d962f6fd 1193}
d962f6fd
A
1194#else
1195
e0eac44e 1196void block_permute(INT16 *block)
de6d9b64 1197{
e0eac44e 1198 int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
de6d9b64
FB
1199 int i;
1200
e0eac44e
FB
1201 for(i=0;i<8;i++) {
1202 tmp1 = block[1];
1203 tmp2 = block[2];
1204 tmp3 = block[3];
1205 tmp4 = block[4];
1206 tmp5 = block[5];
1207 tmp6 = block[6];
1208 block[1] = tmp2;
1209 block[2] = tmp4;
1210 block[3] = tmp6;
1211 block[4] = tmp1;
1212 block[5] = tmp3;
1213 block[6] = tmp5;
1214 block += 8;
1215 }
1216}
d962f6fd 1217#endif
e0eac44e 1218
649c00c9
MN
1219void clear_blocks_c(DCTELEM *blocks)
1220{
1221 memset(blocks, 0, sizeof(DCTELEM)*6*64);
1222}
1223
8ee14970
FB
1224/* XXX: those functions should be suppressed ASAP when all IDCTs are
1225 converted */
1226void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
1227{
1228 ff_idct (block);
1229 put_pixels_clamped(block, dest, line_size);
1230}
1231
1232void gen_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
1233{
1234 ff_idct (block);
1235 add_pixels_clamped(block, dest, line_size);
1236}
1237
e0eac44e
FB
1238void dsputil_init(void)
1239{
1240 int i, j;
c34270f5 1241 int use_permuted_idct;
e0eac44e 1242
de6d9b64
FB
1243 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1244 for(i=0;i<MAX_NEG_CROP;i++) {
1245 cropTbl[i] = 0;
1246 cropTbl[i + MAX_NEG_CROP + 256] = 255;
1247 }
1248
1249 for(i=0;i<512;i++) {
1250 squareTbl[i] = (i - 256) * (i - 256);
1251 }
1252
d962f6fd 1253#ifdef SIMPLE_IDCT
8ee14970 1254 ff_idct = NULL;
d962f6fd 1255#else
4af7bcc1 1256 ff_idct = j_rev_dct;
d962f6fd 1257#endif
de6d9b64 1258 get_pixels = get_pixels_c;
9dbcbd92 1259 diff_pixels = diff_pixels_c;
de6d9b64
FB
1260 put_pixels_clamped = put_pixels_clamped_c;
1261 add_pixels_clamped = add_pixels_clamped_c;
44eb4951 1262 gmc1= gmc1_c;
649c00c9 1263 clear_blocks= clear_blocks_c;
de6d9b64 1264
ba6802de
MN
1265 pix_abs16x16 = pix_abs16x16_c;
1266 pix_abs16x16_x2 = pix_abs16x16_x2_c;
1267 pix_abs16x16_y2 = pix_abs16x16_y2_c;
de6d9b64 1268 pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
ba6802de
MN
1269 pix_abs8x8 = pix_abs8x8_c;
1270 pix_abs8x8_x2 = pix_abs8x8_x2_c;
1271 pix_abs8x8_y2 = pix_abs8x8_y2_c;
1272 pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
03c94ede 1273 av_fdct = fdct_ifast;
de6d9b64 1274
c34270f5 1275 use_permuted_idct = 1;
e0eac44e 1276
980fc7b8 1277#ifdef HAVE_MMX
de6d9b64
FB
1278 dsputil_init_mmx();
1279#endif
3d03c0a2
FB
1280#ifdef ARCH_ARMV4L
1281 dsputil_init_armv4l();
1282#endif
c34270f5
FB
1283#ifdef HAVE_MLIB
1284 dsputil_init_mlib();
1285 use_permuted_idct = 0;
1286#endif
1e98dffb
NK
1287#ifdef ARCH_ALPHA
1288 dsputil_init_alpha();
1289 use_permuted_idct = 0;
1290#endif
c34270f5 1291
d962f6fd 1292#ifdef SIMPLE_IDCT
8ee14970
FB
1293 if (ff_idct == NULL) {
1294 ff_idct_put = simple_idct_put;
1295 ff_idct_add = simple_idct_add;
1296 use_permuted_idct=0;
fc2bb4f4
MN
1297 }
1298#endif
1299 if(ff_idct != NULL) {
8ee14970
FB
1300 ff_idct_put = gen_idct_put;
1301 ff_idct_add = gen_idct_add;
1302 }
d962f6fd 1303
5a240838
MN
1304 if(use_permuted_idct)
1305#ifdef SIMPLE_IDCT
1306 for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
1307#else
1308 for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
1309#endif
1310 else
1311 for(i=0; i<64; i++) permutation[i]=i;
1312
2f349de2
MN
1313 for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
1314 for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
1315
c34270f5
FB
1316 if (use_permuted_idct) {
1317 /* permute for IDCT */
1318 for(i=0;i<64;i++) {
1319 j = zigzag_direct[i];
1320 zigzag_direct[i] = block_permute_op(j);
1321 j = ff_alternate_horizontal_scan[i];
1322 ff_alternate_horizontal_scan[i] = block_permute_op(j);
1323 j = ff_alternate_vertical_scan[i];
1324 ff_alternate_vertical_scan[i] = block_permute_op(j);
1325 }
adc09b2e
MK
1326 block_permute(ff_mpeg1_default_intra_matrix);
1327 block_permute(ff_mpeg1_default_non_intra_matrix);
3bf43d42
MN
1328 block_permute(ff_mpeg4_default_intra_matrix);
1329 block_permute(ff_mpeg4_default_non_intra_matrix);
c34270f5 1330 }
badaf88e
MN
1331
1332 build_zigzag_end();
de6d9b64 1333}
43f1708f 1334
57060b1e
FB
/* Disable every operation that is not bit exact (for testing
   purposes). Only the MMX code is affected; without HAVE_MMX this
   function does nothing. */
void avcodec_set_bit_exact(void)
{
#ifdef HAVE_MMX
    dsputil_set_bit_exact_mmx();
#endif
}
1342
43f1708f
J
1343void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1344 int orig_linesize[3], int coded_linesize,
1345 AVCodecContext *avctx)
1346{
1347 int quad, diff, x, y;
1348 UINT8 *orig, *coded;
1349 UINT32 *sq = squareTbl + 256;
1350
1351 quad = 0;
1352 diff = 0;
1353
1354 /* Luminance */
1355 orig = orig_image[0];
1356 coded = coded_image[0];
1357
1358 for (y=0;y<avctx->height;y++) {
1359 for (x=0;x<avctx->width;x++) {
1360 diff = *(orig + x) - *(coded + x);
1361 quad += sq[diff];
1362 }
1363 orig += orig_linesize[0];
1364 coded += coded_linesize;
1365 }
1366
1367 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1368
1369 if (avctx->psnr_y) {
1370 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1371 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1372 } else
1373 avctx->psnr_y = 99.99;
1374}
1375