Make dct_unquantize_h263 work on systems without MVI extension.
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
1/*
2 * DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
de6d9b64 4 *
ff4ec49e
FB
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
de6d9b64 9 *
ff4ec49e 10 * This library is distributed in the hope that it will be useful,
de6d9b64 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
de6d9b64 14 *
ff4ec49e
FB
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
7ff037e9 18 *
59fe111e 19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
de6d9b64 20 */
de6d9b64
FB
21#include "avcodec.h"
22#include "dsputil.h"
d962f6fd 23#include "simple_idct.h"
de6d9b64 24
4af7bcc1 25void (*ff_idct)(DCTELEM *block);
8ee14970
FB
26void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block);
27void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block);
03c94ede 28void (*av_fdct)(DCTELEM *block);
de6d9b64 29void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
9dbcbd92 30void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
de6d9b64
FB
31void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
32void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
44eb4951 33void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
649c00c9 34void (*clear_blocks)(DCTELEM *blocks);
de6d9b64
FB
35
36op_pixels_abs_func pix_abs16x16;
37op_pixels_abs_func pix_abs16x16_x2;
38op_pixels_abs_func pix_abs16x16_y2;
39op_pixels_abs_func pix_abs16x16_xy2;
40
ba6802de
MN
41op_pixels_abs_func pix_abs8x8;
42op_pixels_abs_func pix_abs8x8_x2;
43op_pixels_abs_func pix_abs8x8_y2;
44op_pixels_abs_func pix_abs8x8_xy2;
45
0cfa9713 46UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
de6d9b64
FB
47UINT32 squareTbl[512];
48
f0ca2e1b
ZK
49extern INT16 default_intra_matrix[64];
50extern INT16 default_non_intra_matrix[64];
51extern INT16 ff_mpeg4_default_intra_matrix[64];
52extern INT16 ff_mpeg4_default_non_intra_matrix[64];
e0eac44e
FB
53
54UINT8 zigzag_direct[64] = {
55 0, 1, 8, 16, 9, 2, 3, 10,
56 17, 24, 32, 25, 18, 11, 4, 5,
57 12, 19, 26, 33, 40, 48, 41, 34,
58 27, 20, 13, 6, 7, 14, 21, 28,
59 35, 42, 49, 56, 57, 50, 43, 36,
60 29, 22, 15, 23, 30, 37, 44, 51,
61 58, 59, 52, 45, 38, 31, 39, 46,
62 53, 60, 61, 54, 47, 55, 62, 63
63};
64
2f349de2
MN
65/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
66UINT16 __align8 inv_zigzag_direct16[64];
67
68/* not permutated zigzag_direct for MMX quantizer */
69UINT8 zigzag_direct_noperm[64];
70
e0eac44e
FB
71UINT8 ff_alternate_horizontal_scan[64] = {
72 0, 1, 2, 3, 8, 9, 16, 17,
73 10, 11, 4, 5, 6, 7, 15, 14,
74 13, 12, 19, 18, 24, 25, 32, 33,
75 26, 27, 20, 21, 22, 23, 28, 29,
76 30, 31, 34, 35, 40, 41, 48, 49,
77 42, 43, 36, 37, 38, 39, 44, 45,
78 46, 47, 50, 51, 56, 57, 58, 59,
79 52, 53, 54, 55, 60, 61, 62, 63,
80};
81
82UINT8 ff_alternate_vertical_scan[64] = {
83 0, 8, 16, 24, 1, 9, 2, 10,
84 17, 25, 32, 40, 48, 56, 57, 49,
85 41, 33, 26, 18, 3, 11, 4, 12,
86 19, 27, 34, 42, 50, 58, 35, 43,
87 51, 59, 20, 28, 5, 13, 6, 14,
88 21, 29, 36, 44, 52, 60, 37, 45,
89 53, 61, 22, 30, 7, 15, 23, 31,
90 38, 46, 54, 62, 39, 47, 55, 63,
91};
92
e4986da9
J
93#ifdef SIMPLE_IDCT
94
0a8d8945 95/* Input permutation for the simple_idct_mmx */
5a240838 96static UINT8 simple_mmx_permutation[64]={
0a8d8945
MN
97 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
98 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
99 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
100 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
101 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
102 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
103 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
104 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
5a240838 105};
e4986da9 106#endif
5a240838 107
2f349de2
MN
108/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
109UINT32 inverse[256]={
110 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
111 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
112 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
113 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
114 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
115 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
116 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
117 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
118 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
119 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
120 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
121 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
122 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
123 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
124 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
125 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
126 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
127 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
128 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
129 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
130 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
131 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
132 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
133 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
134 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
135 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
136 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
137 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
138 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
139 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
140 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
141 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
142};
143
badaf88e
MN
144/* used to skip zeros at the end */
145UINT8 zigzag_end[64];
146
5a240838
MN
147UINT8 permutation[64];
148//UINT8 invPermutation[64];
149
20695ec9 150static void build_zigzag_end(void)
badaf88e
MN
151{
152 int lastIndex;
153 int lastIndexAfterPerm=0;
154 for(lastIndex=0; lastIndex<64; lastIndex++)
155 {
156 if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
157 lastIndexAfterPerm= zigzag_direct[lastIndex];
158 zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
159 }
160}
161
de6d9b64
FB
162void get_pixels_c(DCTELEM *block, const UINT8 *pixels, int line_size)
163{
164 DCTELEM *p;
165 const UINT8 *pix;
166 int i;
167
168 /* read the pixels */
169 p = block;
170 pix = pixels;
171 for(i=0;i<8;i++) {
172 p[0] = pix[0];
173 p[1] = pix[1];
174 p[2] = pix[2];
175 p[3] = pix[3];
176 p[4] = pix[4];
177 p[5] = pix[5];
178 p[6] = pix[6];
179 p[7] = pix[7];
180 pix += line_size;
181 p += 8;
182 }
183}
184
9dbcbd92
MN
185void diff_pixels_c(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride){
186 DCTELEM *p;
187 int i;
188
189 /* read the pixels */
190 p = block;
191 for(i=0;i<8;i++) {
192 p[0] = s1[0] - s2[0];
193 p[1] = s1[1] - s2[1];
194 p[2] = s1[2] - s2[2];
195 p[3] = s1[3] - s2[3];
196 p[4] = s1[4] - s2[4];
197 p[5] = s1[5] - s2[5];
198 p[6] = s1[6] - s2[6];
199 p[7] = s1[7] - s2[7];
200 s1 += stride;
201 s2 += stride;
202 p += 8;
203 }
204}
205
206
de6d9b64
FB
207void put_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
208{
209 const DCTELEM *p;
210 UINT8 *pix;
211 int i;
212 UINT8 *cm = cropTbl + MAX_NEG_CROP;
213
214 /* read the pixels */
215 p = block;
216 pix = pixels;
217 for(i=0;i<8;i++) {
218 pix[0] = cm[p[0]];
219 pix[1] = cm[p[1]];
220 pix[2] = cm[p[2]];
221 pix[3] = cm[p[3]];
222 pix[4] = cm[p[4]];
223 pix[5] = cm[p[5]];
224 pix[6] = cm[p[6]];
225 pix[7] = cm[p[7]];
226 pix += line_size;
227 p += 8;
228 }
229}
230
231void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size)
232{
233 const DCTELEM *p;
234 UINT8 *pix;
235 int i;
236 UINT8 *cm = cropTbl + MAX_NEG_CROP;
237
238 /* read the pixels */
239 p = block;
240 pix = pixels;
241 for(i=0;i<8;i++) {
242 pix[0] = cm[pix[0] + p[0]];
243 pix[1] = cm[pix[1] + p[1]];
244 pix[2] = cm[pix[2] + p[2]];
245 pix[3] = cm[pix[3] + p[3]];
246 pix[4] = cm[pix[4] + p[4]];
247 pix[5] = cm[pix[5] + p[5]];
248 pix[6] = cm[pix[6] + p[6]];
249 pix[7] = cm[pix[7] + p[7]];
250 pix += line_size;
251 p += 8;
252 }
253}
254
59fe111e
MN
255#if 0
256
257#define PIXOP2(OPNAME, OP) \
258void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
259{\
260 int i;\
261 for(i=0; i<h; i++){\
262 OP(*((uint64_t*)block), LD64(pixels));\
263 pixels+=line_size;\
264 block +=line_size;\
265 }\
266}\
267\
268void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
269{\
270 int i;\
271 for(i=0; i<h; i++){\
272 const uint64_t a= LD64(pixels );\
273 const uint64_t b= LD64(pixels+1);\
274 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
275 pixels+=line_size;\
276 block +=line_size;\
277 }\
278}\
279\
280void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
281{\
282 int i;\
283 for(i=0; i<h; i++){\
284 const uint64_t a= LD64(pixels );\
285 const uint64_t b= LD64(pixels+1);\
286 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
287 pixels+=line_size;\
288 block +=line_size;\
289 }\
290}\
291\
292void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
293{\
294 int i;\
295 for(i=0; i<h; i++){\
296 const uint64_t a= LD64(pixels );\
297 const uint64_t b= LD64(pixels+line_size);\
298 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
299 pixels+=line_size;\
300 block +=line_size;\
301 }\
302}\
303\
304void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
305{\
306 int i;\
307 for(i=0; i<h; i++){\
308 const uint64_t a= LD64(pixels );\
309 const uint64_t b= LD64(pixels+line_size);\
310 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
311 pixels+=line_size;\
312 block +=line_size;\
313 }\
314}\
315\
316void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
317{\
318 int i;\
319 const uint64_t a= LD64(pixels );\
320 const uint64_t b= LD64(pixels+1);\
321 uint64_t l0= (a&0x0303030303030303ULL)\
322 + (b&0x0303030303030303ULL)\
323 + 0x0202020202020202ULL;\
324 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
325 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
326 uint64_t l1,h1;\
327\
328 pixels+=line_size;\
329 for(i=0; i<h; i+=2){\
330 uint64_t a= LD64(pixels );\
331 uint64_t b= LD64(pixels+1);\
332 l1= (a&0x0303030303030303ULL)\
333 + (b&0x0303030303030303ULL);\
334 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
335 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
336 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
337 pixels+=line_size;\
338 block +=line_size;\
339 a= LD64(pixels );\
340 b= LD64(pixels+1);\
341 l0= (a&0x0303030303030303ULL)\
342 + (b&0x0303030303030303ULL)\
343 + 0x0202020202020202ULL;\
344 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
345 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
346 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
347 pixels+=line_size;\
348 block +=line_size;\
349 }\
350}\
351\
352void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
353{\
354 int i;\
355 const uint64_t a= LD64(pixels );\
356 const uint64_t b= LD64(pixels+1);\
357 uint64_t l0= (a&0x0303030303030303ULL)\
358 + (b&0x0303030303030303ULL)\
359 + 0x0101010101010101ULL;\
360 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
361 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
362 uint64_t l1,h1;\
363\
364 pixels+=line_size;\
365 for(i=0; i<h; i+=2){\
366 uint64_t a= LD64(pixels );\
367 uint64_t b= LD64(pixels+1);\
368 l1= (a&0x0303030303030303ULL)\
369 + (b&0x0303030303030303ULL);\
370 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
371 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
372 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
373 pixels+=line_size;\
374 block +=line_size;\
375 a= LD64(pixels );\
376 b= LD64(pixels+1);\
377 l0= (a&0x0303030303030303ULL)\
378 + (b&0x0303030303030303ULL)\
379 + 0x0101010101010101ULL;\
380 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
381 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
382 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
383 pixels+=line_size;\
384 block +=line_size;\
385 }\
386}\
387\
388void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
389 OPNAME ## _pixels,\
390 OPNAME ## _pixels_x2,\
391 OPNAME ## _pixels_y2,\
392 OPNAME ## _pixels_xy2,\
393};\
394\
395void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
396 OPNAME ## _pixels,\
397 OPNAME ## _no_rnd_pixels_x2,\
398 OPNAME ## _no_rnd_pixels_y2,\
399 OPNAME ## _no_rnd_pixels_xy2,\
400};
401
402#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
403#else // the disabled branch above is the 64 bit variant; the 32 bit variant follows
404
405#define PIXOP2(OPNAME, OP) \
406void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
407{\
408 int i;\
409 for(i=0; i<h; i++){\
410 OP(*((uint32_t*)(block )), LD32(pixels ));\
411 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
412 pixels+=line_size;\
413 block +=line_size;\
414 }\
415}\
416\
417void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
418{\
419 int i;\
420 for(i=0; i<h; i++){\
421 int j;\
422 for(j=0; j<2; j++){\
423 const uint32_t a= LD32(pixels );\
424 const uint32_t b= LD32(pixels+1);\
425 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
426 pixels+=4;\
427 block +=4;\
428 }\
429 pixels+=line_size-8;\
430 block +=line_size-8;\
431 }\
432}\
433\
434void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
435{\
436 int i;\
437 for(i=0; i<h; i++){\
438 int j;\
439 for(j=0; j<2; j++){\
440 const uint32_t a= LD32(pixels );\
441 const uint32_t b= LD32(pixels+1);\
442 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
443 pixels+=4;\
444 block +=4;\
445 }\
446 pixels+=line_size-8;\
447 block +=line_size-8;\
448 }\
449}\
450\
451void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
452{\
453 int i;\
454 for(i=0; i<h; i++){\
455 int j;\
456 for(j=0; j<2; j++){\
457 const uint32_t a= LD32(pixels );\
458 const uint32_t b= LD32(pixels+line_size);\
459 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
460 pixels+=4;\
461 block +=4;\
462 }\
463 pixels+=line_size-8;\
464 block +=line_size-8;\
465 }\
466}\
467\
468void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
469{\
470 int i;\
471 for(i=0; i<h; i++){\
472 int j;\
473 for(j=0; j<2; j++){\
474 const uint32_t a= LD32(pixels );\
475 const uint32_t b= LD32(pixels+line_size);\
476 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
477 pixels+=4;\
478 block +=4;\
479 }\
480 pixels+=line_size-8;\
481 block +=line_size-8;\
482 }\
483}\
484\
485void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
486{\
487 int j;\
488 for(j=0; j<2; j++){\
489 int i;\
490 const uint32_t a= LD32(pixels );\
491 const uint32_t b= LD32(pixels+1);\
492 uint32_t l0= (a&0x03030303UL)\
493 + (b&0x03030303UL)\
494 + 0x02020202UL;\
495 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
496 + ((b&0xFCFCFCFCUL)>>2);\
497 uint32_t l1,h1;\
498\
499 pixels+=line_size;\
500 for(i=0; i<h; i+=2){\
501 uint32_t a= LD32(pixels );\
502 uint32_t b= LD32(pixels+1);\
503 l1= (a&0x03030303UL)\
504 + (b&0x03030303UL);\
505 h1= ((a&0xFCFCFCFCUL)>>2)\
506 + ((b&0xFCFCFCFCUL)>>2);\
507 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
508 pixels+=line_size;\
509 block +=line_size;\
510 a= LD32(pixels );\
511 b= LD32(pixels+1);\
512 l0= (a&0x03030303UL)\
513 + (b&0x03030303UL)\
514 + 0x02020202UL;\
515 h0= ((a&0xFCFCFCFCUL)>>2)\
516 + ((b&0xFCFCFCFCUL)>>2);\
517 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
518 pixels+=line_size;\
519 block +=line_size;\
520 }\
521 pixels+=4-line_size*(h+1);\
522 block +=4-line_size*h;\
523 }\
524}\
525\
526void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
527{\
528 int j;\
529 for(j=0; j<2; j++){\
530 int i;\
531 const uint32_t a= LD32(pixels );\
532 const uint32_t b= LD32(pixels+1);\
533 uint32_t l0= (a&0x03030303UL)\
534 + (b&0x03030303UL)\
535 + 0x01010101UL;\
536 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
537 + ((b&0xFCFCFCFCUL)>>2);\
538 uint32_t l1,h1;\
539\
540 pixels+=line_size;\
541 for(i=0; i<h; i+=2){\
542 uint32_t a= LD32(pixels );\
543 uint32_t b= LD32(pixels+1);\
544 l1= (a&0x03030303UL)\
545 + (b&0x03030303UL);\
546 h1= ((a&0xFCFCFCFCUL)>>2)\
547 + ((b&0xFCFCFCFCUL)>>2);\
548 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
549 pixels+=line_size;\
550 block +=line_size;\
551 a= LD32(pixels );\
552 b= LD32(pixels+1);\
553 l0= (a&0x03030303UL)\
554 + (b&0x03030303UL)\
555 + 0x01010101UL;\
556 h0= ((a&0xFCFCFCFCUL)>>2)\
557 + ((b&0xFCFCFCFCUL)>>2);\
558 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
559 pixels+=line_size;\
560 block +=line_size;\
561 }\
562 pixels+=4-line_size*(h+1);\
563 block +=4-line_size*h;\
564 }\
565}\
566\
567void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
568 OPNAME ## _pixels,\
569 OPNAME ## _pixels_x2,\
570 OPNAME ## _pixels_y2,\
571 OPNAME ## _pixels_xy2,\
572};\
573\
574void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
575 OPNAME ## _pixels,\
576 OPNAME ## _no_rnd_pixels_x2,\
577 OPNAME ## _no_rnd_pixels_y2,\
578 OPNAME ## _no_rnd_pixels_xy2,\
579};
580#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
581#endif
582
583#define op_put(a, b) a = b
584
585PIXOP2(avg, op_avg)
586PIXOP2(put, op_put)
587#undef op_avg
588#undef op_put
589
57060b1e 590#if 0
59fe111e 591/* FIXME: this stuff could be removed as it is not really used anymore */
de6d9b64
FB
592#define PIXOP(BTYPE, OPNAME, OP, INCR) \
593 \
594static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
595{ \
596 BTYPE *p; \
597 const UINT8 *pix; \
598 \
599 p = block; \
600 pix = pixels; \
601 do { \
602 OP(p[0], pix[0]); \
603 OP(p[1], pix[1]); \
604 OP(p[2], pix[2]); \
605 OP(p[3], pix[3]); \
606 OP(p[4], pix[4]); \
607 OP(p[5], pix[5]); \
608 OP(p[6], pix[6]); \
609 OP(p[7], pix[7]); \
610 pix += line_size; \
611 p += INCR; \
612 } while (--h);; \
613} \
614 \
615static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
616{ \
617 BTYPE *p; \
618 const UINT8 *pix; \
619 \
620 p = block; \
621 pix = pixels; \
622 do { \
623 OP(p[0], avg2(pix[0], pix[1])); \
624 OP(p[1], avg2(pix[1], pix[2])); \
625 OP(p[2], avg2(pix[2], pix[3])); \
626 OP(p[3], avg2(pix[3], pix[4])); \
627 OP(p[4], avg2(pix[4], pix[5])); \
628 OP(p[5], avg2(pix[5], pix[6])); \
629 OP(p[6], avg2(pix[6], pix[7])); \
630 OP(p[7], avg2(pix[7], pix[8])); \
631 pix += line_size; \
632 p += INCR; \
633 } while (--h); \
634} \
635 \
636static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
637{ \
638 BTYPE *p; \
639 const UINT8 *pix; \
640 const UINT8 *pix1; \
641 \
642 p = block; \
643 pix = pixels; \
644 pix1 = pixels + line_size; \
645 do { \
646 OP(p[0], avg2(pix[0], pix1[0])); \
647 OP(p[1], avg2(pix[1], pix1[1])); \
648 OP(p[2], avg2(pix[2], pix1[2])); \
649 OP(p[3], avg2(pix[3], pix1[3])); \
650 OP(p[4], avg2(pix[4], pix1[4])); \
651 OP(p[5], avg2(pix[5], pix1[5])); \
652 OP(p[6], avg2(pix[6], pix1[6])); \
653 OP(p[7], avg2(pix[7], pix1[7])); \
654 pix += line_size; \
655 pix1 += line_size; \
656 p += INCR; \
657 } while(--h); \
658} \
659 \
660static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
661{ \
662 BTYPE *p; \
663 const UINT8 *pix; \
664 const UINT8 *pix1; \
665 \
666 p = block; \
667 pix = pixels; \
668 pix1 = pixels + line_size; \
669 do { \
670 OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1])); \
671 OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2])); \
672 OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3])); \
673 OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4])); \
674 OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5])); \
675 OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6])); \
676 OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7])); \
677 OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8])); \
678 pix += line_size; \
679 pix1 += line_size; \
680 p += INCR; \
681 } while(--h); \
682} \
683 \
684void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
685 OPNAME ## _pixels, \
686 OPNAME ## _pixels_x2, \
687 OPNAME ## _pixels_y2, \
688 OPNAME ## _pixels_xy2, \
689};
690
de6d9b64
FB
691/* rounding primitives */
692#define avg2(a,b) ((a+b+1)>>1)
693#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
694
de6d9b64
FB
695#define op_avg(a, b) a = avg2(a, b)
696#define op_sub(a, b) a -= b
697
de6d9b64
FB
698PIXOP(DCTELEM, sub, op_sub, 8)
699
700/* not rounding primitives */
701#undef avg2
702#undef avg4
703#define avg2(a,b) ((a+b)>>1)
704#define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
705
de6d9b64
FB
706/* motion estimation */
707
708#undef avg2
709#undef avg4
57060b1e
FB
710#endif
711
de6d9b64
FB
712#define avg2(a,b) ((a+b+1)>>1)
713#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
714
44eb4951
MN
715static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
716{
717 const int A=(16-x16)*(16-y16);
718 const int B=( x16)*(16-y16);
719 const int C=(16-x16)*( y16);
720 const int D=( x16)*( y16);
721 int i;
722 rounder= 128 - rounder;
723
724 for(i=0; i<h; i++)
725 {
726 dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
727 dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
728 dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
729 dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
730 dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
731 dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
732 dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
733 dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
734 dst+= srcStride;
735 src+= srcStride;
736 }
737}
738
739static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
740{
741 UINT8 *cm = cropTbl + MAX_NEG_CROP;
742 int i;
743 for(i=0; i<h; i++)
744 {
ba6802de
MN
745 dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
746 dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
747 dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
748 dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
749 dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
750 dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
751 dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
752 dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
44eb4951
MN
753 dst+=dstStride;
754 src+=srcStride;
755 }
756}
757
758static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
759{
760 UINT8 *cm = cropTbl + MAX_NEG_CROP;
761 int i;
762 for(i=0; i<w; i++)
763 {
764 const int src0= src[0*srcStride];
765 const int src1= src[1*srcStride];
766 const int src2= src[2*srcStride];
767 const int src3= src[3*srcStride];
768 const int src4= src[4*srcStride];
769 const int src5= src[5*srcStride];
770 const int src6= src[6*srcStride];
771 const int src7= src[7*srcStride];
772 const int src8= src[8*srcStride];
ba6802de
MN
773 dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
774 dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
775 dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
776 dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
777 dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
778 dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
779 dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
780 dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
44eb4951
MN
781 dst++;
782 src++;
783 }
784}
785
786static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
787{
788 int i;
789 for(i=0; i<8; i++)
790 {
791 dst[0]= src[0];
792 dst[1]= src[1];
793 dst[2]= src[2];
794 dst[3]= src[3];
795 dst[4]= src[4];
796 dst[5]= src[5];
797 dst[6]= src[6];
798 dst[7]= src[7];
799 dst+=dstStride;
800 src+=srcStride;
801 }
802}
803
804static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
805{
806 int i;
807 for(i=0; i<8; i++)
808 {
809 dst[0]= (src1[0] + src2[0] + r)>>1;
810 dst[1]= (src1[1] + src2[1] + r)>>1;
811 dst[2]= (src1[2] + src2[2] + r)>>1;
812 dst[3]= (src1[3] + src2[3] + r)>>1;
813 dst[4]= (src1[4] + src2[4] + r)>>1;
814 dst[5]= (src1[5] + src2[5] + r)>>1;
815 dst[6]= (src1[6] + src2[6] + r)>>1;
816 dst[7]= (src1[7] + src2[7] + r)>>1;
817 dst+=dstStride;
818 src1+=srcStride;
819 src2+=8;
820 }
821}
822
823static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
824{
825 int i;
826 for(i=0; i<8; i++)
827 {
828 dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
829 dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
830 dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
831 dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
832 dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
833 dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
834 dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
835 dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
836 dst+=dstStride;
837 src1+=srcStride;
838 src2+=8;
7ff037e9 839 src3+=8;
44eb4951
MN
840 src4+=8;
841 }
842}
843
844#define QPEL_MC(r, name) \
845static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
846{\
847 put_block(dst, src, dstStride, srcStride);\
848}\
849\
850static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
851{\
852 UINT8 half[64];\
ba6802de 853 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
854 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
855}\
856\
857static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
858{\
ba6802de 859 qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
44eb4951
MN
860}\
861\
862static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
863{\
864 UINT8 half[64];\
ba6802de 865 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
866 avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
867}\
868\
869static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
870{\
871 UINT8 half[64];\
ba6802de 872 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
873 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
874}\
875\
876static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
877{\
ba6802de 878 qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
44eb4951
MN
879}\
880\
881static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
882{\
883 UINT8 half[64];\
ba6802de 884 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
885 avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
886}\
887static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
888{\
889 UINT8 halfH[72];\
7ff037e9 890 UINT8 halfV[64];\
44eb4951 891 UINT8 halfHV[64];\
ba6802de
MN
892 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
893 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
894 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
895 avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
896}\
897static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
898{\
899 UINT8 halfH[72];\
7ff037e9 900 UINT8 halfV[64];\
44eb4951 901 UINT8 halfHV[64];\
ba6802de
MN
902 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
903 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
904 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
905 avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
906}\
907static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
908{\
909 UINT8 halfH[72];\
7ff037e9 910 UINT8 halfV[64];\
44eb4951 911 UINT8 halfHV[64];\
ba6802de
MN
912 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
913 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
914 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 915 avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
44eb4951
MN
916}\
917static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
918{\
919 UINT8 halfH[72];\
7ff037e9 920 UINT8 halfV[64];\
44eb4951 921 UINT8 halfHV[64];\
ba6802de
MN
922 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
923 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
924 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 925 avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
44eb4951
MN
926}\
927static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
928{\
929 UINT8 halfH[72];\
930 UINT8 halfHV[64];\
ba6802de
MN
931 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
932 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
933 avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
934}\
935static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
936{\
937 UINT8 halfH[72];\
938 UINT8 halfHV[64];\
ba6802de
MN
939 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
940 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
941 avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
942}\
943static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
944{\
945 UINT8 halfH[72];\
7ff037e9 946 UINT8 halfV[64];\
44eb4951 947 UINT8 halfHV[64];\
ba6802de
MN
948 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
949 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
950 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 951 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
44eb4951
MN
952}\
953static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
954{\
955 UINT8 halfH[72];\
7ff037e9 956 UINT8 halfV[64];\
44eb4951 957 UINT8 halfHV[64];\
ba6802de
MN
958 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
959 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
960 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 961 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
44eb4951
MN
962}\
963static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
964{\
965 UINT8 halfH[72];\
ba6802de
MN
966 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
967 qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
44eb4951
MN
968}\
969qpel_mc_func qpel_mc ## name ## _tab[16]={ \
970 qpel_mc00_c ## name, \
971 qpel_mc10_c ## name, \
972 qpel_mc20_c ## name, \
973 qpel_mc30_c ## name, \
974 qpel_mc01_c ## name, \
975 qpel_mc11_c ## name, \
976 qpel_mc21_c ## name, \
977 qpel_mc31_c ## name, \
978 qpel_mc02_c ## name, \
979 qpel_mc12_c ## name, \
980 qpel_mc22_c ## name, \
981 qpel_mc32_c ## name, \
982 qpel_mc03_c ## name, \
983 qpel_mc13_c ## name, \
984 qpel_mc23_c ## name, \
985 qpel_mc33_c ## name, \
986};
987
988QPEL_MC(0, _rnd)
989QPEL_MC(1, _no_rnd)
990
ba6802de 991int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
992{
993 int s, i;
994
995 s = 0;
ba6802de 996 for(i=0;i<16;i++) {
de6d9b64
FB
997 s += abs(pix1[0] - pix2[0]);
998 s += abs(pix1[1] - pix2[1]);
999 s += abs(pix1[2] - pix2[2]);
1000 s += abs(pix1[3] - pix2[3]);
1001 s += abs(pix1[4] - pix2[4]);
1002 s += abs(pix1[5] - pix2[5]);
1003 s += abs(pix1[6] - pix2[6]);
1004 s += abs(pix1[7] - pix2[7]);
1005 s += abs(pix1[8] - pix2[8]);
1006 s += abs(pix1[9] - pix2[9]);
1007 s += abs(pix1[10] - pix2[10]);
1008 s += abs(pix1[11] - pix2[11]);
1009 s += abs(pix1[12] - pix2[12]);
1010 s += abs(pix1[13] - pix2[13]);
1011 s += abs(pix1[14] - pix2[14]);
1012 s += abs(pix1[15] - pix2[15]);
1013 pix1 += line_size;
1014 pix2 += line_size;
1015 }
1016 return s;
1017}
1018
ba6802de 1019int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1020{
1021 int s, i;
1022
1023 s = 0;
ba6802de 1024 for(i=0;i<16;i++) {
de6d9b64
FB
1025 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1026 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1027 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1028 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1029 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1030 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1031 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1032 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1033 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1034 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1035 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1036 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1037 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1038 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1039 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1040 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1041 pix1 += line_size;
1042 pix2 += line_size;
1043 }
1044 return s;
1045}
1046
ba6802de 1047int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1048{
1049 int s, i;
1050 UINT8 *pix3 = pix2 + line_size;
1051
1052 s = 0;
ba6802de 1053 for(i=0;i<16;i++) {
de6d9b64
FB
1054 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1055 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1056 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1057 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1058 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1059 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1060 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1061 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1062 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1063 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1064 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1065 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1066 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1067 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1068 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1069 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1070 pix1 += line_size;
1071 pix2 += line_size;
1072 pix3 += line_size;
1073 }
1074 return s;
1075}
1076
ba6802de 1077int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1078{
1079 int s, i;
1080 UINT8 *pix3 = pix2 + line_size;
1081
1082 s = 0;
ba6802de 1083 for(i=0;i<16;i++) {
de6d9b64
FB
1084 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1085 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1086 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1087 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1088 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1089 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1090 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1091 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1092 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1093 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1094 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1095 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1096 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1097 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1098 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1099 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1100 pix1 += line_size;
1101 pix2 += line_size;
1102 pix3 += line_size;
1103 }
1104 return s;
1105}
1106
ba6802de
MN
1107int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1108{
1109 int s, i;
1110
1111 s = 0;
1112 for(i=0;i<8;i++) {
1113 s += abs(pix1[0] - pix2[0]);
1114 s += abs(pix1[1] - pix2[1]);
1115 s += abs(pix1[2] - pix2[2]);
1116 s += abs(pix1[3] - pix2[3]);
1117 s += abs(pix1[4] - pix2[4]);
1118 s += abs(pix1[5] - pix2[5]);
1119 s += abs(pix1[6] - pix2[6]);
1120 s += abs(pix1[7] - pix2[7]);
1121 pix1 += line_size;
1122 pix2 += line_size;
1123 }
1124 return s;
1125}
1126
1127int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1128{
1129 int s, i;
1130
1131 s = 0;
1132 for(i=0;i<8;i++) {
1133 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1134 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1135 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1136 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1137 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1138 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1139 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1140 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1141 pix1 += line_size;
1142 pix2 += line_size;
1143 }
1144 return s;
1145}
1146
1147int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1148{
1149 int s, i;
1150 UINT8 *pix3 = pix2 + line_size;
1151
1152 s = 0;
1153 for(i=0;i<8;i++) {
1154 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1155 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1156 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1157 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1158 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1159 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1160 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1161 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1162 pix1 += line_size;
1163 pix2 += line_size;
1164 pix3 += line_size;
1165 }
1166 return s;
1167}
1168
1169int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1170{
1171 int s, i;
1172 UINT8 *pix3 = pix2 + line_size;
1173
1174 s = 0;
1175 for(i=0;i<8;i++) {
1176 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1177 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1178 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1179 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1180 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1181 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1182 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1183 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1184 pix1 += line_size;
1185 pix2 += line_size;
1186 pix3 += line_size;
1187 }
1188 return s;
1189}
1190
e0eac44e
FB
1191/* permute block according so that it corresponds to the MMX idct
1192 order */
d962f6fd 1193#ifdef SIMPLE_IDCT
5a240838 1194 /* general permutation, but perhaps slightly slower */
d962f6fd
A
1195void block_permute(INT16 *block)
1196{
1197 int i;
1198 INT16 temp[64];
1199
d962f6fd
A
1200 for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1201
1202 for(i=0; i<64; i++) block[i] = temp[i];
d962f6fd 1203}
d962f6fd
A
1204#else
1205
e0eac44e 1206void block_permute(INT16 *block)
de6d9b64 1207{
e0eac44e 1208 int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
de6d9b64
FB
1209 int i;
1210
e0eac44e
FB
1211 for(i=0;i<8;i++) {
1212 tmp1 = block[1];
1213 tmp2 = block[2];
1214 tmp3 = block[3];
1215 tmp4 = block[4];
1216 tmp5 = block[5];
1217 tmp6 = block[6];
1218 block[1] = tmp2;
1219 block[2] = tmp4;
1220 block[3] = tmp6;
1221 block[4] = tmp1;
1222 block[5] = tmp3;
1223 block[6] = tmp5;
1224 block += 8;
1225 }
1226}
d962f6fd 1227#endif
e0eac44e 1228
649c00c9
MN
1229void clear_blocks_c(DCTELEM *blocks)
1230{
1231 memset(blocks, 0, sizeof(DCTELEM)*6*64);
1232}
1233
8ee14970
FB
1234/* XXX: those functions should be suppressed ASAP when all IDCTs are
1235 converted */
1236void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
1237{
1238 ff_idct (block);
1239 put_pixels_clamped(block, dest, line_size);
1240}
1241
1242void gen_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
1243{
1244 ff_idct (block);
1245 add_pixels_clamped(block, dest, line_size);
1246}
1247
e0eac44e
FB
1248void dsputil_init(void)
1249{
1250 int i, j;
c34270f5 1251 int use_permuted_idct;
e0eac44e 1252
de6d9b64
FB
1253 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1254 for(i=0;i<MAX_NEG_CROP;i++) {
1255 cropTbl[i] = 0;
1256 cropTbl[i + MAX_NEG_CROP + 256] = 255;
1257 }
1258
1259 for(i=0;i<512;i++) {
1260 squareTbl[i] = (i - 256) * (i - 256);
1261 }
1262
d962f6fd 1263#ifdef SIMPLE_IDCT
8ee14970 1264 ff_idct = NULL;
d962f6fd 1265#else
4af7bcc1 1266 ff_idct = j_rev_dct;
d962f6fd 1267#endif
de6d9b64 1268 get_pixels = get_pixels_c;
9dbcbd92 1269 diff_pixels = diff_pixels_c;
de6d9b64
FB
1270 put_pixels_clamped = put_pixels_clamped_c;
1271 add_pixels_clamped = add_pixels_clamped_c;
44eb4951 1272 gmc1= gmc1_c;
649c00c9 1273 clear_blocks= clear_blocks_c;
de6d9b64 1274
ba6802de
MN
1275 pix_abs16x16 = pix_abs16x16_c;
1276 pix_abs16x16_x2 = pix_abs16x16_x2_c;
1277 pix_abs16x16_y2 = pix_abs16x16_y2_c;
de6d9b64 1278 pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
ba6802de
MN
1279 pix_abs8x8 = pix_abs8x8_c;
1280 pix_abs8x8_x2 = pix_abs8x8_x2_c;
1281 pix_abs8x8_y2 = pix_abs8x8_y2_c;
1282 pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
03c94ede 1283 av_fdct = fdct_ifast;
de6d9b64 1284
c34270f5 1285 use_permuted_idct = 1;
e0eac44e 1286
980fc7b8 1287#ifdef HAVE_MMX
de6d9b64
FB
1288 dsputil_init_mmx();
1289#endif
3d03c0a2
FB
1290#ifdef ARCH_ARMV4L
1291 dsputil_init_armv4l();
1292#endif
c34270f5
FB
1293#ifdef HAVE_MLIB
1294 dsputil_init_mlib();
1295 use_permuted_idct = 0;
1296#endif
1e98dffb
NK
1297#ifdef ARCH_ALPHA
1298 dsputil_init_alpha();
1299 use_permuted_idct = 0;
1300#endif
c34270f5 1301
d962f6fd 1302#ifdef SIMPLE_IDCT
8ee14970
FB
1303 if (ff_idct == NULL) {
1304 ff_idct_put = simple_idct_put;
1305 ff_idct_add = simple_idct_add;
1306 use_permuted_idct=0;
1307 } else {
1308 ff_idct_put = gen_idct_put;
1309 ff_idct_add = gen_idct_add;
1310 }
d962f6fd
A
1311#endif
1312
5a240838
MN
1313 if(use_permuted_idct)
1314#ifdef SIMPLE_IDCT
1315 for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
1316#else
1317 for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
1318#endif
1319 else
1320 for(i=0; i<64; i++) permutation[i]=i;
1321
2f349de2
MN
1322 for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
1323 for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
1324
c34270f5
FB
1325 if (use_permuted_idct) {
1326 /* permute for IDCT */
1327 for(i=0;i<64;i++) {
1328 j = zigzag_direct[i];
1329 zigzag_direct[i] = block_permute_op(j);
1330 j = ff_alternate_horizontal_scan[i];
1331 ff_alternate_horizontal_scan[i] = block_permute_op(j);
1332 j = ff_alternate_vertical_scan[i];
1333 ff_alternate_vertical_scan[i] = block_permute_op(j);
1334 }
1335 block_permute(default_intra_matrix);
1336 block_permute(default_non_intra_matrix);
3bf43d42
MN
1337 block_permute(ff_mpeg4_default_intra_matrix);
1338 block_permute(ff_mpeg4_default_non_intra_matrix);
c34270f5 1339 }
badaf88e
MN
1340
1341 build_zigzag_end();
de6d9b64 1342}
43f1708f 1343
57060b1e
FB
/* Switch off every operation that is not bit exact (meant for testing).
   Only builds with HAVE_MMX have anything to disable here. */
void avcodec_set_bit_exact(void)
{
#if defined(HAVE_MMX)
    dsputil_set_bit_exact_mmx();
#endif
}
1351
43f1708f
J
1352void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1353 int orig_linesize[3], int coded_linesize,
1354 AVCodecContext *avctx)
1355{
1356 int quad, diff, x, y;
1357 UINT8 *orig, *coded;
1358 UINT32 *sq = squareTbl + 256;
1359
1360 quad = 0;
1361 diff = 0;
1362
1363 /* Luminance */
1364 orig = orig_image[0];
1365 coded = coded_image[0];
1366
1367 for (y=0;y<avctx->height;y++) {
1368 for (x=0;x<avctx->width;x++) {
1369 diff = *(orig + x) - *(coded + x);
1370 quad += sq[diff];
1371 }
1372 orig += orig_linesize[0];
1373 coded += coded_linesize;
1374 }
1375
1376 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1377
1378 if (avctx->psnr_y) {
1379 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1380 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1381 } else
1382 avctx->psnr_y = 99.99;
1383}
1384