reducing sizeof MpegEncContext to avoid stack overflow on crap M$ windo$
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
1/*
2 * DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
de6d9b64 4 *
ff4ec49e
FB
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
de6d9b64 9 *
ff4ec49e 10 * This library is distributed in the hope that it will be useful,
de6d9b64 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
de6d9b64 14 *
ff4ec49e
FB
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
7ff037e9 18 *
59fe111e 19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
de6d9b64 20 */
de6d9b64
FB
21#include "avcodec.h"
22#include "dsputil.h"
d962f6fd 23#include "simple_idct.h"
de6d9b64 24
4af7bcc1 25void (*ff_idct)(DCTELEM *block);
8ee14970
FB
26void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block);
27void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block);
de6d9b64 28void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
9dbcbd92 29void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
de6d9b64
FB
30void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
31void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
44eb4951 32void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
649c00c9 33void (*clear_blocks)(DCTELEM *blocks);
3aa102be
MN
34int (*pix_sum)(UINT8 * pix, int line_size);
35int (*pix_norm1)(UINT8 * pix, int line_size);
de6d9b64
FB
36
37op_pixels_abs_func pix_abs16x16;
38op_pixels_abs_func pix_abs16x16_x2;
39op_pixels_abs_func pix_abs16x16_y2;
40op_pixels_abs_func pix_abs16x16_xy2;
41
ba6802de
MN
42op_pixels_abs_func pix_abs8x8;
43op_pixels_abs_func pix_abs8x8_x2;
44op_pixels_abs_func pix_abs8x8_y2;
45op_pixels_abs_func pix_abs8x8_xy2;
46
0cfa9713 47UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
de6d9b64
FB
48UINT32 squareTbl[512];
49
adc09b2e
MK
50extern INT16 ff_mpeg1_default_intra_matrix[64];
51extern INT16 ff_mpeg1_default_non_intra_matrix[64];
f0ca2e1b
ZK
52extern INT16 ff_mpeg4_default_intra_matrix[64];
53extern INT16 ff_mpeg4_default_non_intra_matrix[64];
e0eac44e
FB
54
55UINT8 zigzag_direct[64] = {
56 0, 1, 8, 16, 9, 2, 3, 10,
57 17, 24, 32, 25, 18, 11, 4, 5,
58 12, 19, 26, 33, 40, 48, 41, 34,
59 27, 20, 13, 6, 7, 14, 21, 28,
60 35, 42, 49, 56, 57, 50, 43, 36,
61 29, 22, 15, 23, 30, 37, 44, 51,
62 58, 59, 52, 45, 38, 31, 39, 46,
63 53, 60, 61, 54, 47, 55, 62, 63
64};
65
2f349de2
MN
/* inverse of zigzag_direct (not permuted), plus 1, for the MMX quantizer */
67UINT16 __align8 inv_zigzag_direct16[64];
68
/* zigzag_direct without permutation, for the MMX quantizer */
70UINT8 zigzag_direct_noperm[64];
71
e0eac44e
FB
72UINT8 ff_alternate_horizontal_scan[64] = {
73 0, 1, 2, 3, 8, 9, 16, 17,
74 10, 11, 4, 5, 6, 7, 15, 14,
75 13, 12, 19, 18, 24, 25, 32, 33,
76 26, 27, 20, 21, 22, 23, 28, 29,
77 30, 31, 34, 35, 40, 41, 48, 49,
78 42, 43, 36, 37, 38, 39, 44, 45,
79 46, 47, 50, 51, 56, 57, 58, 59,
80 52, 53, 54, 55, 60, 61, 62, 63,
81};
82
83UINT8 ff_alternate_vertical_scan[64] = {
84 0, 8, 16, 24, 1, 9, 2, 10,
85 17, 25, 32, 40, 48, 56, 57, 49,
86 41, 33, 26, 18, 3, 11, 4, 12,
87 19, 27, 34, 42, 50, 58, 35, 43,
88 51, 59, 20, 28, 5, 13, 6, 14,
89 21, 29, 36, 44, 52, 60, 37, 45,
90 53, 61, 22, 30, 7, 15, 23, 31,
91 38, 46, 54, 62, 39, 47, 55, 63,
92};
93
e4986da9
J
94#ifdef SIMPLE_IDCT
95
0a8d8945 96/* Input permutation for the simple_idct_mmx */
5a240838 97static UINT8 simple_mmx_permutation[64]={
0a8d8945
MN
98 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
99 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
100 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
101 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
102 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
103 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
104 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
105 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
5a240838 106};
e4986da9 107#endif
5a240838 108
2f349de2
MN
109/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
110UINT32 inverse[256]={
111 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
112 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
113 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
114 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
115 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
116 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
117 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
118 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
119 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
120 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
121 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
122 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
123 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
124 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
125 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
126 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
127 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
128 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
129 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
130 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
131 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
132 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
133 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
134 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
135 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
136 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
137 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
138 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
139 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
140 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
141 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
142 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
143};
144
badaf88e
MN
145/* used to skip zeros at the end */
146UINT8 zigzag_end[64];
147
5a240838
MN
148UINT8 permutation[64];
149//UINT8 invPermutation[64];
150
20695ec9 151static void build_zigzag_end(void)
badaf88e
MN
152{
153 int lastIndex;
154 int lastIndexAfterPerm=0;
155 for(lastIndex=0; lastIndex<64; lastIndex++)
156 {
157 if(zigzag_direct[lastIndex] > lastIndexAfterPerm)
158 lastIndexAfterPerm= zigzag_direct[lastIndex];
159 zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
160 }
161}
162
3aa102be
MN
163int pix_sum_c(UINT8 * pix, int line_size)
164{
165 int s, i, j;
166
167 s = 0;
168 for (i = 0; i < 16; i++) {
169 for (j = 0; j < 16; j += 8) {
170 s += pix[0];
171 s += pix[1];
172 s += pix[2];
173 s += pix[3];
174 s += pix[4];
175 s += pix[5];
176 s += pix[6];
177 s += pix[7];
178 pix += 8;
179 }
180 pix += line_size - 16;
181 }
182 return s;
183}
184
185int pix_norm1_c(UINT8 * pix, int line_size)
186{
187 int s, i, j;
188 UINT32 *sq = squareTbl + 256;
189
190 s = 0;
191 for (i = 0; i < 16; i++) {
192 for (j = 0; j < 16; j += 8) {
193 s += sq[pix[0]];
194 s += sq[pix[1]];
195 s += sq[pix[2]];
196 s += sq[pix[3]];
197 s += sq[pix[4]];
198 s += sq[pix[5]];
199 s += sq[pix[6]];
200 s += sq[pix[7]];
201 pix += 8;
202 }
203 pix += line_size - 16;
204 }
205 return s;
206}
207
208
c13e1abd 209void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
de6d9b64 210{
de6d9b64
FB
211 int i;
212
213 /* read the pixels */
de6d9b64 214 for(i=0;i<8;i++) {
c13e1abd
FH
215 block[0] = pixels[0];
216 block[1] = pixels[1];
217 block[2] = pixels[2];
218 block[3] = pixels[3];
219 block[4] = pixels[4];
220 block[5] = pixels[5];
221 block[6] = pixels[6];
222 block[7] = pixels[7];
223 pixels += line_size;
224 block += 8;
de6d9b64
FB
225 }
226}
227
c13e1abd
FH
228void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1, const UINT8 *s2,
229 int stride){
9dbcbd92
MN
230 int i;
231
232 /* read the pixels */
9dbcbd92 233 for(i=0;i<8;i++) {
c13e1abd
FH
234 block[0] = s1[0] - s2[0];
235 block[1] = s1[1] - s2[1];
236 block[2] = s1[2] - s2[2];
237 block[3] = s1[3] - s2[3];
238 block[4] = s1[4] - s2[4];
239 block[5] = s1[5] - s2[5];
240 block[6] = s1[6] - s2[6];
241 block[7] = s1[7] - s2[7];
9dbcbd92
MN
242 s1 += stride;
243 s2 += stride;
c13e1abd 244 block += 8;
9dbcbd92
MN
245 }
246}
247
248
c13e1abd
FH
249void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
250 int line_size)
de6d9b64 251{
de6d9b64
FB
252 int i;
253 UINT8 *cm = cropTbl + MAX_NEG_CROP;
254
255 /* read the pixels */
de6d9b64 256 for(i=0;i<8;i++) {
c13e1abd
FH
257 pixels[0] = cm[block[0]];
258 pixels[1] = cm[block[1]];
259 pixels[2] = cm[block[2]];
260 pixels[3] = cm[block[3]];
261 pixels[4] = cm[block[4]];
262 pixels[5] = cm[block[5]];
263 pixels[6] = cm[block[6]];
264 pixels[7] = cm[block[7]];
265
266 pixels += line_size;
267 block += 8;
de6d9b64
FB
268 }
269}
270
c13e1abd
FH
271void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
272 int line_size)
de6d9b64 273{
de6d9b64
FB
274 int i;
275 UINT8 *cm = cropTbl + MAX_NEG_CROP;
276
277 /* read the pixels */
de6d9b64 278 for(i=0;i<8;i++) {
c13e1abd
FH
279 pixels[0] = cm[pixels[0] + block[0]];
280 pixels[1] = cm[pixels[1] + block[1]];
281 pixels[2] = cm[pixels[2] + block[2]];
282 pixels[3] = cm[pixels[3] + block[3]];
283 pixels[4] = cm[pixels[4] + block[4]];
284 pixels[5] = cm[pixels[5] + block[5]];
285 pixels[6] = cm[pixels[6] + block[6]];
286 pixels[7] = cm[pixels[7] + block[7]];
287 pixels += line_size;
288 block += 8;
de6d9b64
FB
289 }
290}
59fe111e
MN
/*
 * Put / average primitives for 8-pixel-wide blocks that handle several
 * pixels per word access.
 *
 * Two word sizes are supported; flip the "#if 0" below to select 64-bit
 * words.  Every helper is written once and parameterised by the word type,
 * the load macro and the byte-replicated constants.
 *
 * Per-byte averages are computed without carries between bytes:
 *   rounding up:   (a|b) - (((a^b) & 0xFE..) >> 1)  ==  (a + b + 1) >> 1
 *   rounding down: (a&b) + (((a^b) & 0xFE..) >> 1)  ==  (a + b)     >> 1
 * The four-sample (xy2) average splits every byte into its low 2 bits and
 * high 6 bits so that the partial sums cannot overflow a byte:
 *   (a + b + c + d + R) >> 2  ==  sum(x >> 2) + ((sum(x & 3) + R) >> 2)
 * with R = 2 for the rounding and R = 1 for the "no_rnd" variant.
 */
#if 0
/* 64-bit words: one access covers a full 8-pixel row */
#define PIXOP2_WORD   uint64_t
#define PIXOP2_LD(p)  LD64(p)
#define PIXOP2_FE     0xFEFEFEFEFEFEFEFEULL
#define PIXOP2_FC     0xFCFCFCFCFCFCFCFCULL
#define PIXOP2_03     0x0303030303030303ULL
#define PIXOP2_0F     0x0F0F0F0F0F0F0F0FULL
#define PIXOP2_RND1   0x0101010101010101ULL
#define PIXOP2_RND2   0x0202020202020202ULL
#else
/* 32-bit words: two accesses per 8-pixel row */
#define PIXOP2_WORD   uint32_t
#define PIXOP2_LD(p)  LD32(p)
#define PIXOP2_FE     0xFEFEFEFEUL
#define PIXOP2_FC     0xFCFCFCFCUL
#define PIXOP2_03     0x03030303UL
#define PIXOP2_0F     0x0F0F0F0FUL
#define PIXOP2_RND1   0x01010101UL
#define PIXOP2_RND2   0x02020202UL
#endif

/* number of pixels (bytes) handled by one word access */
#define PIXOP2_STEP ((int)sizeof(PIXOP2_WORD))

/* per-byte average of two words, rounding up / rounding down */
#define PIXOP2_AVG_RND(a, b)    (((a) | (b)) - ((((a) ^ (b)) & PIXOP2_FE) >> 1))
#define PIXOP2_AVG_NO_RND(a, b) (((a) & (b)) + ((((a) ^ (b)) & PIXOP2_FE) >> 1))

/* per-byte partial sums of two words: low 2 bits, and high 6 bits (pre-shifted) */
#define PIXOP2_LOW2(a, b)  (((a) & PIXOP2_03) + ((b) & PIXOP2_03))
#define PIXOP2_HIGH6(a, b) ((((a) & PIXOP2_FC) >> 2) + (((b) & PIXOP2_FC) >> 2))

/*
 * Generates one two-sample interpolation function.  DELTA is the offset of
 * the second sample (1 for x2, line_size for y2); BLEND is one of the
 * PIXOP2_AVG_* macros.
 */
#define PIXOP2_AVG2_FUNC(NAME, OP, DELTA, BLEND) \
void NAME(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        int off;\
        for (off = 0; off < 8; off += PIXOP2_STEP) {\
            const PIXOP2_WORD p = PIXOP2_LD(pixels + off);\
            const PIXOP2_WORD q = PIXOP2_LD(pixels + off + (DELTA));\
            OP(*((PIXOP2_WORD*)(block + off)), BLEND(p, q));\
        }\
        pixels += line_size;\
        block  += line_size;\
    }\
}

/*
 * Generates one four-sample (xy2) interpolation function.  Each word column
 * is processed top to bottom, two rows per iteration, re-using the partial
 * sums of the row shared by consecutive outputs; afterwards the pointers
 * are rewound to the top and moved on to the next word column.
 */
#define PIXOP2_XY2_FUNC(NAME, OP, ROUND) \
void NAME(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int col;\
    for (col = 0; col < 8; col += PIXOP2_STEP) {\
        int row;\
        PIXOP2_WORD p = PIXOP2_LD(pixels);\
        PIXOP2_WORD q = PIXOP2_LD(pixels + 1);\
        PIXOP2_WORD lo_a = PIXOP2_LOW2(p, q) + ROUND;\
        PIXOP2_WORD hi_a = PIXOP2_HIGH6(p, q);\
        PIXOP2_WORD lo_b, hi_b;\
\
        pixels += line_size;\
        for (row = 0; row < h; row += 2) {\
            p = PIXOP2_LD(pixels);\
            q = PIXOP2_LD(pixels + 1);\
            lo_b = PIXOP2_LOW2(p, q);\
            hi_b = PIXOP2_HIGH6(p, q);\
            OP(*((PIXOP2_WORD*)block), hi_a + hi_b + (((lo_a + lo_b) >> 2) & PIXOP2_0F));\
            pixels += line_size;\
            block  += line_size;\
\
            p = PIXOP2_LD(pixels);\
            q = PIXOP2_LD(pixels + 1);\
            lo_a = PIXOP2_LOW2(p, q) + ROUND;\
            hi_a = PIXOP2_HIGH6(p, q);\
            OP(*((PIXOP2_WORD*)block), hi_a + hi_b + (((lo_a + lo_b) >> 2) & PIXOP2_0F));\
            pixels += line_size;\
            block  += line_size;\
        }\
        pixels += PIXOP2_STEP - line_size * (h + 1);\
        block  += PIXOP2_STEP - line_size * h;\
    }\
}

/*
 * Instantiates the whole function family for one store operation:
 * OPNAME_pixels, OPNAME_[no_rnd_]pixels_{x2,y2,xy2} and the two dispatch
 * tables indexed 0..3 as { full-pel, x2, y2, xy2 }.
 */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        int off;\
        for (off = 0; off < 8; off += PIXOP2_STEP) {\
            OP(*((PIXOP2_WORD*)(block + off)), PIXOP2_LD(pixels + off));\
        }\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
PIXOP2_AVG2_FUNC(OPNAME ## _no_rnd_pixels_x2, OP, 1, PIXOP2_AVG_NO_RND)\
\
PIXOP2_AVG2_FUNC(OPNAME ## _pixels_x2, OP, 1, PIXOP2_AVG_RND)\
\
PIXOP2_AVG2_FUNC(OPNAME ## _no_rnd_pixels_y2, OP, line_size, PIXOP2_AVG_NO_RND)\
\
PIXOP2_AVG2_FUNC(OPNAME ## _pixels_y2, OP, line_size, PIXOP2_AVG_RND)\
\
PIXOP2_XY2_FUNC(OPNAME ## _pixels_xy2, OP, PIXOP2_RND2)\
\
PIXOP2_XY2_FUNC(OPNAME ## _no_rnd_pixels_xy2, OP, PIXOP2_RND1)\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};

/* store operations: average with the destination (rounding up), or overwrite */
#define op_avg(a, b) a = PIXOP2_AVG_RND(a, b)
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
624
#if 0
/*
 * FIXME: this legacy byte-at-a-time code is not really used anymore and
 * could be removed.  It is compiled out.
 *
 * PIXOP generates, for one destination type BTYPE and one store operation
 * OP, the functions OPNAME_pixels{,_x2,_y2,_xy2} working on 8-pixel-wide
 * blocks of h rows, plus the matching dispatch table.  INCR is the
 * destination row increment.
 */
#define PIXOP(BTYPE, OPNAME, OP, INCR) \
\
static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    int x; \
    do { \
        for (x = 0; x < 8; x++) { \
            OP(block[x], pixels[x]); \
        } \
        pixels += line_size; \
        block  += INCR; \
    } while (--h); \
} \
\
static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    int x; \
    do { \
        for (x = 0; x < 8; x++) { \
            OP(block[x], avg2(pixels[x], pixels[x + 1])); \
        } \
        pixels += line_size; \
        block  += INCR; \
    } while (--h); \
} \
\
static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    const UINT8 *below = pixels + line_size; \
    int x; \
    do { \
        for (x = 0; x < 8; x++) { \
            OP(block[x], avg2(pixels[x], below[x])); \
        } \
        pixels += line_size; \
        below  += line_size; \
        block  += INCR; \
    } while (--h); \
} \
\
static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    const UINT8 *below = pixels + line_size; \
    int x; \
    do { \
        for (x = 0; x < 8; x++) { \
            OP(block[x], avg4(pixels[x], pixels[x + 1], below[x], below[x + 1])); \
        } \
        pixels += line_size; \
        below  += line_size; \
        block  += INCR; \
    } while (--h); \
} \
\
void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
    OPNAME ## _pixels, \
    OPNAME ## _pixels_x2, \
    OPNAME ## _pixels_y2, \
    OPNAME ## _pixels_xy2, \
};

/* rounding primitives */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

#define op_avg(a, b) a = avg2(a, b)
#define op_sub(a, b) a -= b
#define op_put(a, b) a = b

PIXOP(DCTELEM, sub, op_sub, 8)
PIXOP(uint8_t, avg, op_avg, line_size)
PIXOP(uint8_t, put, op_put, line_size)

/* not rounding primitives */
#undef avg2
#undef avg4
#define avg2(a,b) ((a+b)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+1)>>2)

PIXOP(uint8_t, avg_no_rnd, op_avg, line_size)
PIXOP(uint8_t, put_no_rnd, op_put, line_size)
/* motion estimation */

#undef avg2
#undef avg4
#endif
751
de6d9b64
FB
/*
 * Rounding averages of two and of four values: (sum + half) >> shift.
 * Every argument is parenthesised so that arguments containing operators of
 * lower precedence than '+' (e.g. `x | y` or `c ? x : y`) are evaluated as a
 * unit.  Arguments are evaluated exactly once.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
754
44eb4951
MN
755static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
756{
757 const int A=(16-x16)*(16-y16);
758 const int B=( x16)*(16-y16);
759 const int C=(16-x16)*( y16);
760 const int D=( x16)*( y16);
761 int i;
762 rounder= 128 - rounder;
763
764 for(i=0; i<h; i++)
765 {
766 dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
767 dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
768 dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
769 dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
770 dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
771 dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
772 dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
773 dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
774 dst+= srcStride;
775 src+= srcStride;
776 }
777}
778
779static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
780{
781 UINT8 *cm = cropTbl + MAX_NEG_CROP;
782 int i;
783 for(i=0; i<h; i++)
784 {
ba6802de
MN
785 dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
786 dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
787 dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
788 dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
789 dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
790 dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
791 dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
792 dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
44eb4951
MN
793 dst+=dstStride;
794 src+=srcStride;
795 }
796}
797
798static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
799{
800 UINT8 *cm = cropTbl + MAX_NEG_CROP;
801 int i;
802 for(i=0; i<w; i++)
803 {
804 const int src0= src[0*srcStride];
805 const int src1= src[1*srcStride];
806 const int src2= src[2*srcStride];
807 const int src3= src[3*srcStride];
808 const int src4= src[4*srcStride];
809 const int src5= src[5*srcStride];
810 const int src6= src[6*srcStride];
811 const int src7= src[7*srcStride];
812 const int src8= src[8*srcStride];
ba6802de
MN
813 dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
814 dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
815 dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
816 dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
817 dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
818 dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
819 dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
820 dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
44eb4951
MN
821 dst++;
822 src++;
823 }
824}
825
826static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
827{
828 int i;
829 for(i=0; i<8; i++)
830 {
831 dst[0]= src[0];
832 dst[1]= src[1];
833 dst[2]= src[2];
834 dst[3]= src[3];
835 dst[4]= src[4];
836 dst[5]= src[5];
837 dst[6]= src[6];
838 dst[7]= src[7];
839 dst+=dstStride;
840 src+=srcStride;
841 }
842}
843
844static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
845{
846 int i;
847 for(i=0; i<8; i++)
848 {
849 dst[0]= (src1[0] + src2[0] + r)>>1;
850 dst[1]= (src1[1] + src2[1] + r)>>1;
851 dst[2]= (src1[2] + src2[2] + r)>>1;
852 dst[3]= (src1[3] + src2[3] + r)>>1;
853 dst[4]= (src1[4] + src2[4] + r)>>1;
854 dst[5]= (src1[5] + src2[5] + r)>>1;
855 dst[6]= (src1[6] + src2[6] + r)>>1;
856 dst[7]= (src1[7] + src2[7] + r)>>1;
857 dst+=dstStride;
858 src1+=srcStride;
859 src2+=8;
860 }
861}
862
863static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
864{
865 int i;
866 for(i=0; i<8; i++)
867 {
868 dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
869 dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
870 dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
871 dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
872 dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
873 dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
874 dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
875 dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
876 dst+=dstStride;
877 src1+=srcStride;
878 src2+=8;
7ff037e9 879 src3+=8;
44eb4951
MN
880 src4+=8;
881 }
882}
883
884#define QPEL_MC(r, name) \
885static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
886{\
887 put_block(dst, src, dstStride, srcStride);\
888}\
889\
890static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
891{\
892 UINT8 half[64];\
ba6802de 893 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
894 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
895}\
896\
897static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
898{\
ba6802de 899 qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
44eb4951
MN
900}\
901\
902static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
903{\
904 UINT8 half[64];\
ba6802de 905 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
906 avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
907}\
908\
909static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
910{\
911 UINT8 half[64];\
ba6802de 912 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
913 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
914}\
915\
916static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
917{\
ba6802de 918 qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
44eb4951
MN
919}\
920\
921static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
922{\
923 UINT8 half[64];\
ba6802de 924 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
44eb4951
MN
925 avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
926}\
927static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
928{\
929 UINT8 halfH[72];\
7ff037e9 930 UINT8 halfV[64];\
44eb4951 931 UINT8 halfHV[64];\
ba6802de
MN
932 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
933 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
934 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
935 avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
936}\
937static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
938{\
939 UINT8 halfH[72];\
7ff037e9 940 UINT8 halfV[64];\
44eb4951 941 UINT8 halfHV[64];\
ba6802de
MN
942 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
943 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
944 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
945 avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
946}\
947static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
948{\
949 UINT8 halfH[72];\
7ff037e9 950 UINT8 halfV[64];\
44eb4951 951 UINT8 halfHV[64];\
ba6802de
MN
952 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
953 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
954 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 955 avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
44eb4951
MN
956}\
957static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
958{\
959 UINT8 halfH[72];\
7ff037e9 960 UINT8 halfV[64];\
44eb4951 961 UINT8 halfHV[64];\
ba6802de
MN
962 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
963 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
964 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 965 avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
44eb4951
MN
966}\
967static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
968{\
969 UINT8 halfH[72];\
970 UINT8 halfHV[64];\
ba6802de
MN
971 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
972 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
973 avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
974}\
975static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
976{\
977 UINT8 halfH[72];\
978 UINT8 halfHV[64];\
ba6802de
MN
979 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
980 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
44eb4951
MN
981 avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
982}\
983static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
984{\
985 UINT8 halfH[72];\
7ff037e9 986 UINT8 halfV[64];\
44eb4951 987 UINT8 halfHV[64];\
ba6802de
MN
988 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
989 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
990 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 991 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
44eb4951
MN
992}\
993static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
994{\
995 UINT8 halfH[72];\
7ff037e9 996 UINT8 halfV[64];\
44eb4951 997 UINT8 halfHV[64];\
ba6802de
MN
998 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
999 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
1000 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
7ff037e9 1001 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
44eb4951
MN
1002}\
1003static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
1004{\
1005 UINT8 halfH[72];\
ba6802de
MN
1006 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
1007 qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
44eb4951
MN
1008}\
1009qpel_mc_func qpel_mc ## name ## _tab[16]={ \
1010 qpel_mc00_c ## name, \
1011 qpel_mc10_c ## name, \
1012 qpel_mc20_c ## name, \
1013 qpel_mc30_c ## name, \
1014 qpel_mc01_c ## name, \
1015 qpel_mc11_c ## name, \
1016 qpel_mc21_c ## name, \
1017 qpel_mc31_c ## name, \
1018 qpel_mc02_c ## name, \
1019 qpel_mc12_c ## name, \
1020 qpel_mc22_c ## name, \
1021 qpel_mc32_c ## name, \
1022 qpel_mc03_c ## name, \
1023 qpel_mc13_c ## name, \
1024 qpel_mc23_c ## name, \
1025 qpel_mc33_c ## name, \
1026};
1027
1028QPEL_MC(0, _rnd)
1029QPEL_MC(1, _no_rnd)
1030
ba6802de 1031int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1032{
1033 int s, i;
1034
1035 s = 0;
ba6802de 1036 for(i=0;i<16;i++) {
de6d9b64
FB
1037 s += abs(pix1[0] - pix2[0]);
1038 s += abs(pix1[1] - pix2[1]);
1039 s += abs(pix1[2] - pix2[2]);
1040 s += abs(pix1[3] - pix2[3]);
1041 s += abs(pix1[4] - pix2[4]);
1042 s += abs(pix1[5] - pix2[5]);
1043 s += abs(pix1[6] - pix2[6]);
1044 s += abs(pix1[7] - pix2[7]);
1045 s += abs(pix1[8] - pix2[8]);
1046 s += abs(pix1[9] - pix2[9]);
1047 s += abs(pix1[10] - pix2[10]);
1048 s += abs(pix1[11] - pix2[11]);
1049 s += abs(pix1[12] - pix2[12]);
1050 s += abs(pix1[13] - pix2[13]);
1051 s += abs(pix1[14] - pix2[14]);
1052 s += abs(pix1[15] - pix2[15]);
1053 pix1 += line_size;
1054 pix2 += line_size;
1055 }
1056 return s;
1057}
1058
ba6802de 1059int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1060{
1061 int s, i;
1062
1063 s = 0;
ba6802de 1064 for(i=0;i<16;i++) {
de6d9b64
FB
1065 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1066 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1067 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1068 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1069 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1070 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1071 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1072 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1073 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1074 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1075 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1076 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1077 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1078 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1079 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1080 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1081 pix1 += line_size;
1082 pix2 += line_size;
1083 }
1084 return s;
1085}
1086
ba6802de 1087int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1088{
1089 int s, i;
1090 UINT8 *pix3 = pix2 + line_size;
1091
1092 s = 0;
ba6802de 1093 for(i=0;i<16;i++) {
de6d9b64
FB
1094 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1095 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1096 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1097 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1098 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1099 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1100 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1101 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1102 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1103 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1104 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1105 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1106 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1107 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1108 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1109 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1110 pix1 += line_size;
1111 pix2 += line_size;
1112 pix3 += line_size;
1113 }
1114 return s;
1115}
1116
ba6802de 1117int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
de6d9b64
FB
1118{
1119 int s, i;
1120 UINT8 *pix3 = pix2 + line_size;
1121
1122 s = 0;
ba6802de 1123 for(i=0;i<16;i++) {
de6d9b64
FB
1124 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1125 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1126 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1127 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1128 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1129 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1130 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1131 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1132 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1133 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1134 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1135 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1136 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1137 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1138 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1139 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1140 pix1 += line_size;
1141 pix2 += line_size;
1142 pix3 += line_size;
1143 }
1144 return s;
1145}
1146
ba6802de
MN
1147int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1148{
1149 int s, i;
1150
1151 s = 0;
1152 for(i=0;i<8;i++) {
1153 s += abs(pix1[0] - pix2[0]);
1154 s += abs(pix1[1] - pix2[1]);
1155 s += abs(pix1[2] - pix2[2]);
1156 s += abs(pix1[3] - pix2[3]);
1157 s += abs(pix1[4] - pix2[4]);
1158 s += abs(pix1[5] - pix2[5]);
1159 s += abs(pix1[6] - pix2[6]);
1160 s += abs(pix1[7] - pix2[7]);
1161 pix1 += line_size;
1162 pix2 += line_size;
1163 }
1164 return s;
1165}
1166
1167int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1168{
1169 int s, i;
1170
1171 s = 0;
1172 for(i=0;i<8;i++) {
1173 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1174 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1175 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1176 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1177 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1178 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1179 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1180 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1181 pix1 += line_size;
1182 pix2 += line_size;
1183 }
1184 return s;
1185}
1186
1187int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1188{
1189 int s, i;
1190 UINT8 *pix3 = pix2 + line_size;
1191
1192 s = 0;
1193 for(i=0;i<8;i++) {
1194 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1195 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1196 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1197 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1198 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1199 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1200 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1201 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1202 pix1 += line_size;
1203 pix2 += line_size;
1204 pix3 += line_size;
1205 }
1206 return s;
1207}
1208
1209int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1210{
1211 int s, i;
1212 UINT8 *pix3 = pix2 + line_size;
1213
1214 s = 0;
1215 for(i=0;i<8;i++) {
1216 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1217 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1218 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1219 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1220 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1221 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1222 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1223 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1224 pix1 += line_size;
1225 pix2 += line_size;
1226 pix3 += line_size;
1227 }
1228 return s;
1229}
1230
e0eac44e
FB
1231/* permute block according so that it corresponds to the MMX idct
1232 order */
d962f6fd 1233#ifdef SIMPLE_IDCT
5a240838 1234 /* general permutation, but perhaps slightly slower */
d962f6fd
A
1235void block_permute(INT16 *block)
1236{
1237 int i;
1238 INT16 temp[64];
1239
d962f6fd
A
1240 for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1241
1242 for(i=0; i<64; i++) block[i] = temp[i];
d962f6fd 1243}
d962f6fd
A
1244#else
1245
e0eac44e 1246void block_permute(INT16 *block)
de6d9b64 1247{
e0eac44e 1248 int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
de6d9b64
FB
1249 int i;
1250
e0eac44e
FB
1251 for(i=0;i<8;i++) {
1252 tmp1 = block[1];
1253 tmp2 = block[2];
1254 tmp3 = block[3];
1255 tmp4 = block[4];
1256 tmp5 = block[5];
1257 tmp6 = block[6];
1258 block[1] = tmp2;
1259 block[2] = tmp4;
1260 block[3] = tmp6;
1261 block[4] = tmp1;
1262 block[5] = tmp3;
1263 block[6] = tmp5;
1264 block += 8;
1265 }
1266}
d962f6fd 1267#endif
e0eac44e 1268
649c00c9
MN
1269void clear_blocks_c(DCTELEM *blocks)
1270{
1271 memset(blocks, 0, sizeof(DCTELEM)*6*64);
1272}
1273
8ee14970
FB
1274/* XXX: those functions should be suppressed ASAP when all IDCTs are
1275 converted */
1276void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
1277{
1278 ff_idct (block);
1279 put_pixels_clamped(block, dest, line_size);
1280}
1281
1282void gen_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
1283{
1284 ff_idct (block);
1285 add_pixels_clamped(block, dest, line_size);
1286}
1287
e0eac44e
FB
1288void dsputil_init(void)
1289{
1290 int i, j;
c34270f5 1291 int use_permuted_idct;
e0eac44e 1292
de6d9b64
FB
1293 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1294 for(i=0;i<MAX_NEG_CROP;i++) {
1295 cropTbl[i] = 0;
1296 cropTbl[i + MAX_NEG_CROP + 256] = 255;
1297 }
1298
1299 for(i=0;i<512;i++) {
1300 squareTbl[i] = (i - 256) * (i - 256);
1301 }
1302
d962f6fd 1303#ifdef SIMPLE_IDCT
8ee14970 1304 ff_idct = NULL;
d962f6fd 1305#else
4af7bcc1 1306 ff_idct = j_rev_dct;
d962f6fd 1307#endif
de6d9b64 1308 get_pixels = get_pixels_c;
9dbcbd92 1309 diff_pixels = diff_pixels_c;
de6d9b64
FB
1310 put_pixels_clamped = put_pixels_clamped_c;
1311 add_pixels_clamped = add_pixels_clamped_c;
44eb4951 1312 gmc1= gmc1_c;
649c00c9 1313 clear_blocks= clear_blocks_c;
3aa102be
MN
1314 pix_sum= pix_sum_c;
1315 pix_norm1= pix_norm1_c;
de6d9b64 1316
ba6802de
MN
1317 pix_abs16x16 = pix_abs16x16_c;
1318 pix_abs16x16_x2 = pix_abs16x16_x2_c;
1319 pix_abs16x16_y2 = pix_abs16x16_y2_c;
de6d9b64 1320 pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
ba6802de
MN
1321 pix_abs8x8 = pix_abs8x8_c;
1322 pix_abs8x8_x2 = pix_abs8x8_x2_c;
1323 pix_abs8x8_y2 = pix_abs8x8_y2_c;
1324 pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
de6d9b64 1325
c34270f5 1326 use_permuted_idct = 1;
e0eac44e 1327
980fc7b8 1328#ifdef HAVE_MMX
de6d9b64
FB
1329 dsputil_init_mmx();
1330#endif
3d03c0a2
FB
1331#ifdef ARCH_ARMV4L
1332 dsputil_init_armv4l();
1333#endif
c34270f5
FB
1334#ifdef HAVE_MLIB
1335 dsputil_init_mlib();
1336 use_permuted_idct = 0;
1337#endif
1e98dffb
NK
1338#ifdef ARCH_ALPHA
1339 dsputil_init_alpha();
1340 use_permuted_idct = 0;
1341#endif
59925ef2 1342#ifdef ARCH_POWERPC
ab6c65f6 1343 dsputil_init_ppc();
a43bd1d7 1344#endif
c34270f5 1345
d962f6fd 1346#ifdef SIMPLE_IDCT
8ee14970
FB
1347 if (ff_idct == NULL) {
1348 ff_idct_put = simple_idct_put;
1349 ff_idct_add = simple_idct_add;
1350 use_permuted_idct=0;
fc2bb4f4
MN
1351 }
1352#endif
1353 if(ff_idct != NULL) {
8ee14970
FB
1354 ff_idct_put = gen_idct_put;
1355 ff_idct_add = gen_idct_add;
1356 }
d962f6fd 1357
5a240838
MN
1358 if(use_permuted_idct)
1359#ifdef SIMPLE_IDCT
1360 for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
1361#else
1362 for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
1363#endif
1364 else
1365 for(i=0; i<64; i++) permutation[i]=i;
1366
2f349de2
MN
1367 for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
1368 for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
1369
c34270f5
FB
1370 if (use_permuted_idct) {
1371 /* permute for IDCT */
1372 for(i=0;i<64;i++) {
1373 j = zigzag_direct[i];
1374 zigzag_direct[i] = block_permute_op(j);
1375 j = ff_alternate_horizontal_scan[i];
1376 ff_alternate_horizontal_scan[i] = block_permute_op(j);
1377 j = ff_alternate_vertical_scan[i];
1378 ff_alternate_vertical_scan[i] = block_permute_op(j);
1379 }
adc09b2e
MK
1380 block_permute(ff_mpeg1_default_intra_matrix);
1381 block_permute(ff_mpeg1_default_non_intra_matrix);
3bf43d42
MN
1382 block_permute(ff_mpeg4_default_intra_matrix);
1383 block_permute(ff_mpeg4_default_non_intra_matrix);
c34270f5 1384 }
badaf88e
MN
1385
1386 build_zigzag_end();
de6d9b64 1387}
43f1708f 1388
57060b1e
FB
/* Testing aid: disable every operation that is not bit exact.  Only the
   MMX code has such a hook, so this is a no-op without HAVE_MMX. */
void avcodec_set_bit_exact(void)
{
#ifdef HAVE_MMX
    dsputil_set_bit_exact_mmx();
#endif
}
1396
43f1708f
J
1397void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1398 int orig_linesize[3], int coded_linesize,
1399 AVCodecContext *avctx)
1400{
1401 int quad, diff, x, y;
1402 UINT8 *orig, *coded;
1403 UINT32 *sq = squareTbl + 256;
1404
1405 quad = 0;
1406 diff = 0;
1407
1408 /* Luminance */
1409 orig = orig_image[0];
1410 coded = coded_image[0];
1411
1412 for (y=0;y<avctx->height;y++) {
1413 for (x=0;x<avctx->width;x++) {
1414 diff = *(orig + x) - *(coded + x);
1415 quad += sq[diff];
1416 }
1417 orig += orig_linesize[0];
1418 coded += coded_linesize;
1419 }
1420
1421 avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1422
1423 if (avctx->psnr_y) {
1424 avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1425 avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y);
1426 } else
1427 avctx->psnr_y = 99.99;
1428}
1429