 * Copyright (c) 2000, 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
23 #include "simple_idct.h"
25 void (*ff_idct
)(DCTELEM
*block
);
26 void (*ff_idct_put
)(UINT8
*dest
, int line_size
, DCTELEM
*block
);
27 void (*ff_idct_add
)(UINT8
*dest
, int line_size
, DCTELEM
*block
);
28 void (*av_fdct
)(DCTELEM
*block
);
29 void (*get_pixels
)(DCTELEM
*block
, const UINT8
*pixels
, int line_size
);
30 void (*diff_pixels
)(DCTELEM
*block
, const UINT8
*s1
, const UINT8
*s2
, int stride
);
31 void (*put_pixels_clamped
)(const DCTELEM
*block
, UINT8
*pixels
, int line_size
);
32 void (*add_pixels_clamped
)(const DCTELEM
*block
, UINT8
*pixels
, int line_size
);
33 void (*gmc1
)(UINT8
*dst
, UINT8
*src
, int srcStride
, int h
, int x16
, int y16
, int rounder
);
34 void (*clear_blocks
)(DCTELEM
*blocks
);
36 op_pixels_abs_func pix_abs16x16
;
37 op_pixels_abs_func pix_abs16x16_x2
;
38 op_pixels_abs_func pix_abs16x16_y2
;
39 op_pixels_abs_func pix_abs16x16_xy2
;
41 op_pixels_abs_func pix_abs8x8
;
42 op_pixels_abs_func pix_abs8x8_x2
;
43 op_pixels_abs_func pix_abs8x8_y2
;
44 op_pixels_abs_func pix_abs8x8_xy2
;
46 UINT8 cropTbl
[256 + 2 * MAX_NEG_CROP
];
47 UINT32 squareTbl
[512];
49 extern INT16 default_intra_matrix
[64];
50 extern INT16 default_non_intra_matrix
[64];
51 extern INT16 ff_mpeg4_default_intra_matrix
[64];
52 extern INT16 ff_mpeg4_default_non_intra_matrix
[64];
54 UINT8 zigzag_direct
[64] = {
55 0, 1, 8, 16, 9, 2, 3, 10,
56 17, 24, 32, 25, 18, 11, 4, 5,
57 12, 19, 26, 33, 40, 48, 41, 34,
58 27, 20, 13, 6, 7, 14, 21, 28,
59 35, 42, 49, 56, 57, 50, 43, 36,
60 29, 22, 15, 23, 30, 37, 44, 51,
61 58, 59, 52, 45, 38, 31, 39, 46,
62 53, 60, 61, 54, 47, 55, 62, 63
/* non-permuted inverse of zigzag_direct, plus 1, for the MMX quantizer */
66 UINT16 __align8 inv_zigzag_direct16
[64];
/* non-permuted zigzag_direct for the MMX quantizer */
69 UINT8 zigzag_direct_noperm
[64];
71 UINT8 ff_alternate_horizontal_scan
[64] = {
72 0, 1, 2, 3, 8, 9, 16, 17,
73 10, 11, 4, 5, 6, 7, 15, 14,
74 13, 12, 19, 18, 24, 25, 32, 33,
75 26, 27, 20, 21, 22, 23, 28, 29,
76 30, 31, 34, 35, 40, 41, 48, 49,
77 42, 43, 36, 37, 38, 39, 44, 45,
78 46, 47, 50, 51, 56, 57, 58, 59,
79 52, 53, 54, 55, 60, 61, 62, 63,
82 UINT8 ff_alternate_vertical_scan
[64] = {
83 0, 8, 16, 24, 1, 9, 2, 10,
84 17, 25, 32, 40, 48, 56, 57, 49,
85 41, 33, 26, 18, 3, 11, 4, 12,
86 19, 27, 34, 42, 50, 58, 35, 43,
87 51, 59, 20, 28, 5, 13, 6, 14,
88 21, 29, 36, 44, 52, 60, 37, 45,
89 53, 61, 22, 30, 7, 15, 23, 31,
90 38, 46, 54, 62, 39, 47, 55, 63,
95 /* Input permutation for the simple_idct_mmx */
96 static UINT8 simple_mmx_permutation
[64]={
97 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
98 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
99 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
100 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
101 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
102 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
103 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
104 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
108 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
109 UINT32 inverse
[256]={
110 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
111 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
112 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
113 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
114 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
115 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
116 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
117 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
118 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
119 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
120 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
121 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
122 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
123 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
124 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
125 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
126 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
127 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
128 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
129 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
130 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
131 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
132 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
133 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
134 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
135 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
136 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
137 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
138 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
139 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
140 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
141 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
144 /* used to skip zeros at the end */
145 UINT8 zigzag_end
[64];
147 UINT8 permutation
[64];
148 //UINT8 invPermutation[64];
150 static void build_zigzag_end(void)
153 int lastIndexAfterPerm
=0;
154 for(lastIndex
=0; lastIndex
<64; lastIndex
++)
156 if(zigzag_direct
[lastIndex
] > lastIndexAfterPerm
)
157 lastIndexAfterPerm
= zigzag_direct
[lastIndex
];
158 zigzag_end
[lastIndex
]= lastIndexAfterPerm
+ 1;
162 void get_pixels_c(DCTELEM
*block
, const UINT8
*pixels
, int line_size
)
168 /* read the pixels */
185 void diff_pixels_c(DCTELEM
*block
, const UINT8
*s1
, const UINT8
*s2
, int stride
){
189 /* read the pixels */
192 p
[0] = s1
[0] - s2
[0];
193 p
[1] = s1
[1] - s2
[1];
194 p
[2] = s1
[2] - s2
[2];
195 p
[3] = s1
[3] - s2
[3];
196 p
[4] = s1
[4] - s2
[4];
197 p
[5] = s1
[5] - s2
[5];
198 p
[6] = s1
[6] - s2
[6];
199 p
[7] = s1
[7] - s2
[7];
207 void put_pixels_clamped_c(const DCTELEM
*block
, UINT8
*pixels
, int line_size
)
212 UINT8
*cm
= cropTbl
+ MAX_NEG_CROP
;
214 /* read the pixels */
231 void add_pixels_clamped_c(const DCTELEM
*block
, UINT8
*pixels
, int line_size
)
236 UINT8
*cm
= cropTbl
+ MAX_NEG_CROP
;
238 /* read the pixels */
242 pix
[0] = cm
[pix
[0] + p
[0]];
243 pix
[1] = cm
[pix
[1] + p
[1]];
244 pix
[2] = cm
[pix
[2] + p
[2]];
245 pix
[3] = cm
[pix
[3] + p
[3]];
246 pix
[4] = cm
[pix
[4] + p
[4]];
247 pix
[5] = cm
[pix
[5] + p
[5]];
248 pix
[6] = cm
[pix
[6] + p
[6]];
249 pix
[7] = cm
[pix
[7] + p
[7]];
257 struct unaligned_64
{ uint64_t l
; } __attribute__((packed
));
258 struct unaligned_32
{ uint32_t l
; } __attribute__((packed
));
260 #define LD32(a) (((const struct unaligned_32 *) (a))->l)
261 #define LD64(a) (((const struct unaligned_64 *) (a))->l)
265 #define LD32(a) (*((uint32_t*)(a)))
266 #define LD64(a) (*((uint64_t*)(a)))
268 #endif /* !__GNUC__ */
272 #define PIXOP2(OPNAME, OP) \
273 void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
277 OP(*((uint64_t*)block), LD64(pixels));\
283 void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
287 const uint64_t a= LD64(pixels );\
288 const uint64_t b= LD64(pixels+1);\
289 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
295 void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
299 const uint64_t a= LD64(pixels );\
300 const uint64_t b= LD64(pixels+1);\
301 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
307 void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
311 const uint64_t a= LD64(pixels );\
312 const uint64_t b= LD64(pixels+line_size);\
313 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
319 void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
323 const uint64_t a= LD64(pixels );\
324 const uint64_t b= LD64(pixels+line_size);\
325 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
331 void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
334 const uint64_t a= LD64(pixels );\
335 const uint64_t b= LD64(pixels+1);\
336 uint64_t l0= (a&0x0303030303030303ULL)\
337 + (b&0x0303030303030303ULL)\
338 + 0x0202020202020202ULL;\
339 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
340 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
344 for(i=0; i<h; i+=2){\
345 uint64_t a= LD64(pixels );\
346 uint64_t b= LD64(pixels+1);\
347 l1= (a&0x0303030303030303ULL)\
348 + (b&0x0303030303030303ULL);\
349 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
350 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
351 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
356 l0= (a&0x0303030303030303ULL)\
357 + (b&0x0303030303030303ULL)\
358 + 0x0202020202020202ULL;\
359 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
360 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
361 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
367 void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
370 const uint64_t a= LD64(pixels );\
371 const uint64_t b= LD64(pixels+1);\
372 uint64_t l0= (a&0x0303030303030303ULL)\
373 + (b&0x0303030303030303ULL)\
374 + 0x0101010101010101ULL;\
375 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
376 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
380 for(i=0; i<h; i+=2){\
381 uint64_t a= LD64(pixels );\
382 uint64_t b= LD64(pixels+1);\
383 l1= (a&0x0303030303030303ULL)\
384 + (b&0x0303030303030303ULL);\
385 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
386 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
387 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
392 l0= (a&0x0303030303030303ULL)\
393 + (b&0x0303030303030303ULL)\
394 + 0x0101010101010101ULL;\
395 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
396 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
397 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
403 void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
405 OPNAME ## _pixels_x2,\
406 OPNAME ## _pixels_y2,\
407 OPNAME ## _pixels_xy2,\
410 void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
412 OPNAME ## _no_rnd_pixels_x2,\
413 OPNAME ## _no_rnd_pixels_y2,\
414 OPNAME ## _no_rnd_pixels_xy2,\
417 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
418 #else // 64 bit variant
420 #define PIXOP2(OPNAME, OP) \
421 void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
425 OP(*((uint32_t*)(block )), LD32(pixels ));\
426 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
432 void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
438 const uint32_t a= LD32(pixels );\
439 const uint32_t b= LD32(pixels+1);\
440 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
444 pixels+=line_size-8;\
445 block +=line_size-8;\
449 void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
455 const uint32_t a= LD32(pixels );\
456 const uint32_t b= LD32(pixels+1);\
457 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
461 pixels+=line_size-8;\
462 block +=line_size-8;\
466 void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
472 const uint32_t a= LD32(pixels );\
473 const uint32_t b= LD32(pixels+line_size);\
474 OP(*((uint32_t*)block), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
478 pixels+=line_size-8;\
479 block +=line_size-8;\
483 void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
489 const uint32_t a= LD32(pixels );\
490 const uint32_t b= LD32(pixels+line_size);\
491 OP(*((uint32_t*)block), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
495 pixels+=line_size-8;\
496 block +=line_size-8;\
500 void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
505 const uint32_t a= LD32(pixels );\
506 const uint32_t b= LD32(pixels+1);\
507 uint32_t l0= (a&0x03030303UL)\
510 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
511 + ((b&0xFCFCFCFCUL)>>2);\
515 for(i=0; i<h; i+=2){\
516 uint32_t a= LD32(pixels );\
517 uint32_t b= LD32(pixels+1);\
518 l1= (a&0x03030303UL)\
520 h1= ((a&0xFCFCFCFCUL)>>2)\
521 + ((b&0xFCFCFCFCUL)>>2);\
522 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
527 l0= (a&0x03030303UL)\
530 h0= ((a&0xFCFCFCFCUL)>>2)\
531 + ((b&0xFCFCFCFCUL)>>2);\
532 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
536 pixels+=4-line_size*(h+1);\
537 block +=4-line_size*h;\
541 void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
546 const uint32_t a= LD32(pixels );\
547 const uint32_t b= LD32(pixels+1);\
548 uint32_t l0= (a&0x03030303UL)\
551 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
552 + ((b&0xFCFCFCFCUL)>>2);\
556 for(i=0; i<h; i+=2){\
557 uint32_t a= LD32(pixels );\
558 uint32_t b= LD32(pixels+1);\
559 l1= (a&0x03030303UL)\
561 h1= ((a&0xFCFCFCFCUL)>>2)\
562 + ((b&0xFCFCFCFCUL)>>2);\
563 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
568 l0= (a&0x03030303UL)\
571 h0= ((a&0xFCFCFCFCUL)>>2)\
572 + ((b&0xFCFCFCFCUL)>>2);\
573 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
577 pixels+=4-line_size*(h+1);\
578 block +=4-line_size*h;\
582 void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
584 OPNAME ## _pixels_x2,\
585 OPNAME ## _pixels_y2,\
586 OPNAME ## _pixels_xy2,\
589 void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
591 OPNAME ## _no_rnd_pixels_x2,\
592 OPNAME ## _no_rnd_pixels_y2,\
593 OPNAME ## _no_rnd_pixels_xy2,\
595 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
598 #define op_put(a, b) a = b
/* FIXME: this stuff could be removed as it's not really used anymore */
607 #define PIXOP(BTYPE, OPNAME, OP, INCR) \
609 static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
630 static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
638 OP(p[0], avg2(pix[0], pix[1])); \
639 OP(p[1], avg2(pix[1], pix[2])); \
640 OP(p[2], avg2(pix[2], pix[3])); \
641 OP(p[3], avg2(pix[3], pix[4])); \
642 OP(p[4], avg2(pix[4], pix[5])); \
643 OP(p[5], avg2(pix[5], pix[6])); \
644 OP(p[6], avg2(pix[6], pix[7])); \
645 OP(p[7], avg2(pix[7], pix[8])); \
651 static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
659 pix1 = pixels + line_size; \
661 OP(p[0], avg2(pix[0], pix1[0])); \
662 OP(p[1], avg2(pix[1], pix1[1])); \
663 OP(p[2], avg2(pix[2], pix1[2])); \
664 OP(p[3], avg2(pix[3], pix1[3])); \
665 OP(p[4], avg2(pix[4], pix1[4])); \
666 OP(p[5], avg2(pix[5], pix1[5])); \
667 OP(p[6], avg2(pix[6], pix1[6])); \
668 OP(p[7], avg2(pix[7], pix1[7])); \
675 static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
683 pix1 = pixels + line_size; \
685 OP(p[0], avg4(pix[0], pix[1], pix1[0], pix1[1])); \
686 OP(p[1], avg4(pix[1], pix[2], pix1[1], pix1[2])); \
687 OP(p[2], avg4(pix[2], pix[3], pix1[2], pix1[3])); \
688 OP(p[3], avg4(pix[3], pix[4], pix1[3], pix1[4])); \
689 OP(p[4], avg4(pix[4], pix[5], pix1[4], pix1[5])); \
690 OP(p[5], avg4(pix[5], pix[6], pix1[5], pix1[6])); \
691 OP(p[6], avg4(pix[6], pix[7], pix1[6], pix1[7])); \
692 OP(p[7], avg4(pix[7], pix[8], pix1[7], pix1[8])); \
699 void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
701 OPNAME ## _pixels_x2, \
702 OPNAME ## _pixels_y2, \
703 OPNAME ## _pixels_xy2, \
706 /* rounding primitives */
707 #define avg2(a,b) ((a+b+1)>>1)
708 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
710 #define op_avg(a, b) a = avg2(a, b)
711 #define op_sub(a, b) a -= b
713 PIXOP(DCTELEM
, sub
, op_sub
, 8)
715 /* not rounding primitives */
718 #define avg2(a,b) ((a+b)>>1)
719 #define avg4(a,b,c,d) ((a+b+c+d+1)>>2)
721 /* motion estimation */
727 #define avg2(a,b) ((a+b+1)>>1)
728 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
730 static void gmc1_c(UINT8
*dst
, UINT8
*src
, int srcStride
, int h
, int x16
, int y16
, int rounder
)
732 const int A
=(16-x16
)*(16-y16
);
733 const int B
=( x16
)*(16-y16
);
734 const int C
=(16-x16
)*( y16
);
735 const int D
=( x16
)*( y16
);
737 rounder
= 128 - rounder
;
741 dst
[0]= (A
*src
[0] + B
*src
[1] + C
*src
[srcStride
+0] + D
*src
[srcStride
+1] + rounder
)>>8;
742 dst
[1]= (A
*src
[1] + B
*src
[2] + C
*src
[srcStride
+1] + D
*src
[srcStride
+2] + rounder
)>>8;
743 dst
[2]= (A
*src
[2] + B
*src
[3] + C
*src
[srcStride
+2] + D
*src
[srcStride
+3] + rounder
)>>8;
744 dst
[3]= (A
*src
[3] + B
*src
[4] + C
*src
[srcStride
+3] + D
*src
[srcStride
+4] + rounder
)>>8;
745 dst
[4]= (A
*src
[4] + B
*src
[5] + C
*src
[srcStride
+4] + D
*src
[srcStride
+5] + rounder
)>>8;
746 dst
[5]= (A
*src
[5] + B
*src
[6] + C
*src
[srcStride
+5] + D
*src
[srcStride
+6] + rounder
)>>8;
747 dst
[6]= (A
*src
[6] + B
*src
[7] + C
*src
[srcStride
+6] + D
*src
[srcStride
+7] + rounder
)>>8;
748 dst
[7]= (A
*src
[7] + B
*src
[8] + C
*src
[srcStride
+7] + D
*src
[srcStride
+8] + rounder
)>>8;
754 static void qpel_h_lowpass(UINT8
*dst
, UINT8
*src
, int dstStride
, int srcStride
, int h
, int r
)
756 UINT8
*cm
= cropTbl
+ MAX_NEG_CROP
;
760 dst
[0]= cm
[(((src
[0]+src
[1])*20 - (src
[0]+src
[2])*6 + (src
[1]+src
[3])*3 - (src
[2]+src
[4]) + r
)>>5)];
761 dst
[1]= cm
[(((src
[1]+src
[2])*20 - (src
[0]+src
[3])*6 + (src
[0]+src
[4])*3 - (src
[1]+src
[5]) + r
)>>5)];
762 dst
[2]= cm
[(((src
[2]+src
[3])*20 - (src
[1]+src
[4])*6 + (src
[0]+src
[5])*3 - (src
[0]+src
[6]) + r
)>>5)];
763 dst
[3]= cm
[(((src
[3]+src
[4])*20 - (src
[2]+src
[5])*6 + (src
[1]+src
[6])*3 - (src
[0]+src
[7]) + r
)>>5)];
764 dst
[4]= cm
[(((src
[4]+src
[5])*20 - (src
[3]+src
[6])*6 + (src
[2]+src
[7])*3 - (src
[1]+src
[8]) + r
)>>5)];
765 dst
[5]= cm
[(((src
[5]+src
[6])*20 - (src
[4]+src
[7])*6 + (src
[3]+src
[8])*3 - (src
[2]+src
[8]) + r
)>>5)];
766 dst
[6]= cm
[(((src
[6]+src
[7])*20 - (src
[5]+src
[8])*6 + (src
[4]+src
[8])*3 - (src
[3]+src
[7]) + r
)>>5)];
767 dst
[7]= cm
[(((src
[7]+src
[8])*20 - (src
[6]+src
[8])*6 + (src
[5]+src
[7])*3 - (src
[4]+src
[6]) + r
)>>5)];
773 static void qpel_v_lowpass(UINT8
*dst
, UINT8
*src
, int dstStride
, int srcStride
, int w
, int r
)
775 UINT8
*cm
= cropTbl
+ MAX_NEG_CROP
;
779 const int src0
= src
[0*srcStride
];
780 const int src1
= src
[1*srcStride
];
781 const int src2
= src
[2*srcStride
];
782 const int src3
= src
[3*srcStride
];
783 const int src4
= src
[4*srcStride
];
784 const int src5
= src
[5*srcStride
];
785 const int src6
= src
[6*srcStride
];
786 const int src7
= src
[7*srcStride
];
787 const int src8
= src
[8*srcStride
];
788 dst
[0*dstStride
]= cm
[(((src0
+src1
)*20 - (src0
+src2
)*6 + (src1
+src3
)*3 - (src2
+src4
) + r
)>>5)];
789 dst
[1*dstStride
]= cm
[(((src1
+src2
)*20 - (src0
+src3
)*6 + (src0
+src4
)*3 - (src1
+src5
) + r
)>>5)];
790 dst
[2*dstStride
]= cm
[(((src2
+src3
)*20 - (src1
+src4
)*6 + (src0
+src5
)*3 - (src0
+src6
) + r
)>>5)];
791 dst
[3*dstStride
]= cm
[(((src3
+src4
)*20 - (src2
+src5
)*6 + (src1
+src6
)*3 - (src0
+src7
) + r
)>>5)];
792 dst
[4*dstStride
]= cm
[(((src4
+src5
)*20 - (src3
+src6
)*6 + (src2
+src7
)*3 - (src1
+src8
) + r
)>>5)];
793 dst
[5*dstStride
]= cm
[(((src5
+src6
)*20 - (src4
+src7
)*6 + (src3
+src8
)*3 - (src2
+src8
) + r
)>>5)];
794 dst
[6*dstStride
]= cm
[(((src6
+src7
)*20 - (src5
+src8
)*6 + (src4
+src8
)*3 - (src3
+src7
) + r
)>>5)];
795 dst
[7*dstStride
]= cm
[(((src7
+src8
)*20 - (src6
+src8
)*6 + (src5
+src7
)*3 - (src4
+src6
) + r
)>>5)];
801 static inline void put_block(UINT8
*dst
, UINT8
*src
, int dstStride
, int srcStride
)
819 static inline void avg2_block(UINT8
*dst
, UINT8
*src1
, UINT8
*src2
, int dstStride
, int srcStride
, int r
)
824 dst
[0]= (src1
[0] + src2
[0] + r
)>>1;
825 dst
[1]= (src1
[1] + src2
[1] + r
)>>1;
826 dst
[2]= (src1
[2] + src2
[2] + r
)>>1;
827 dst
[3]= (src1
[3] + src2
[3] + r
)>>1;
828 dst
[4]= (src1
[4] + src2
[4] + r
)>>1;
829 dst
[5]= (src1
[5] + src2
[5] + r
)>>1;
830 dst
[6]= (src1
[6] + src2
[6] + r
)>>1;
831 dst
[7]= (src1
[7] + src2
[7] + r
)>>1;
838 static inline void avg4_block(UINT8
*dst
, UINT8
*src1
, UINT8
*src2
, UINT8
*src3
, UINT8
*src4
, int dstStride
, int srcStride
, int r
)
843 dst
[0]= (src1
[0] + src2
[0] + src3
[0] + src4
[0] + r
)>>2;
844 dst
[1]= (src1
[1] + src2
[1] + src3
[1] + src4
[1] + r
)>>2;
845 dst
[2]= (src1
[2] + src2
[2] + src3
[2] + src4
[2] + r
)>>2;
846 dst
[3]= (src1
[3] + src2
[3] + src3
[3] + src4
[3] + r
)>>2;
847 dst
[4]= (src1
[4] + src2
[4] + src3
[4] + src4
[4] + r
)>>2;
848 dst
[5]= (src1
[5] + src2
[5] + src3
[5] + src4
[5] + r
)>>2;
849 dst
[6]= (src1
[6] + src2
[6] + src3
[6] + src4
[6] + r
)>>2;
850 dst
[7]= (src1
[7] + src2
[7] + src3
[7] + src4
[7] + r
)>>2;
859 #define QPEL_MC(r, name) \
860 static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
862 put_block(dst, src, dstStride, srcStride);\
865 static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
868 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
869 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
872 static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
874 qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
877 static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
880 qpel_h_lowpass(half, src, 8, srcStride, 8, 16-r);\
881 avg2_block(dst, src+1, half, dstStride, srcStride, 1-r);\
884 static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
887 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
888 avg2_block(dst, src, half, dstStride, srcStride, 1-r);\
891 static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
893 qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
896 static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
899 qpel_v_lowpass(half, src, 8, srcStride, 8, 16-r);\
900 avg2_block(dst, src+srcStride, half, dstStride, srcStride, 1-r);\
902 static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
907 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
908 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
909 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
910 avg4_block(dst, src, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
912 static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
917 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
918 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
919 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
920 avg4_block(dst, src+1, halfH, halfV, halfHV, dstStride, srcStride, 2-r);\
922 static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
927 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
928 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
929 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
930 avg4_block(dst, src+srcStride, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
932 static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
937 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
938 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
939 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
940 avg4_block(dst, src+srcStride+1, halfH+8, halfV, halfHV, dstStride, srcStride, 2-r);\
942 static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
946 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
947 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
948 avg2_block(dst, halfH, halfHV, dstStride, 8, 1-r);\
950 static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
954 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
955 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
956 avg2_block(dst, halfH+8, halfHV, dstStride, 8, 1-r);\
958 static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
963 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
964 qpel_v_lowpass(halfV, src, 8, srcStride, 8, 16-r);\
965 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
966 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
968 static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
973 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
974 qpel_v_lowpass(halfV, src+1, 8, srcStride, 8, 16-r);\
975 qpel_v_lowpass(halfHV, halfH, 8, 8, 8, 16-r);\
976 avg2_block(dst, halfV, halfHV, dstStride, 8, 1-r);\
978 static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
981 qpel_h_lowpass(halfH, src, 8, srcStride, 9, 16-r);\
982 qpel_v_lowpass(dst, halfH, dstStride, 8, 8, 16-r);\
984 qpel_mc_func qpel_mc ## name ## _tab[16]={ \
985 qpel_mc00_c ## name, \
986 qpel_mc10_c ## name, \
987 qpel_mc20_c ## name, \
988 qpel_mc30_c ## name, \
989 qpel_mc01_c ## name, \
990 qpel_mc11_c ## name, \
991 qpel_mc21_c ## name, \
992 qpel_mc31_c ## name, \
993 qpel_mc02_c ## name, \
994 qpel_mc12_c ## name, \
995 qpel_mc22_c ## name, \
996 qpel_mc32_c ## name, \
997 qpel_mc03_c ## name, \
998 qpel_mc13_c ## name, \
999 qpel_mc23_c ## name, \
1000 qpel_mc33_c ## name, \
1006 int pix_abs16x16_c(UINT8
*pix1
, UINT8
*pix2
, int line_size
)
1012 s
+= abs(pix1
[0] - pix2
[0]);
1013 s
+= abs(pix1
[1] - pix2
[1]);
1014 s
+= abs(pix1
[2] - pix2
[2]);
1015 s
+= abs(pix1
[3] - pix2
[3]);
1016 s
+= abs(pix1
[4] - pix2
[4]);
1017 s
+= abs(pix1
[5] - pix2
[5]);
1018 s
+= abs(pix1
[6] - pix2
[6]);
1019 s
+= abs(pix1
[7] - pix2
[7]);
1020 s
+= abs(pix1
[8] - pix2
[8]);
1021 s
+= abs(pix1
[9] - pix2
[9]);
1022 s
+= abs(pix1
[10] - pix2
[10]);
1023 s
+= abs(pix1
[11] - pix2
[11]);
1024 s
+= abs(pix1
[12] - pix2
[12]);
1025 s
+= abs(pix1
[13] - pix2
[13]);
1026 s
+= abs(pix1
[14] - pix2
[14]);
1027 s
+= abs(pix1
[15] - pix2
[15]);
1034 int pix_abs16x16_x2_c(UINT8
*pix1
, UINT8
*pix2
, int line_size
)
1040 s
+= abs(pix1
[0] - avg2(pix2
[0], pix2
[1]));
1041 s
+= abs(pix1
[1] - avg2(pix2
[1], pix2
[2]));
1042 s
+= abs(pix1
[2] - avg2(pix2
[2], pix2
[3]));
1043 s
+= abs(pix1
[3] - avg2(pix2
[3], pix2
[4]));
1044 s
+= abs(pix1
[4] - avg2(pix2
[4], pix2
[5]));
1045 s
+= abs(pix1
[5] - avg2(pix2
[5], pix2
[6]));
1046 s
+= abs(pix1
[6] - avg2(pix2
[6], pix2
[7]));
1047 s
+= abs(pix1
[7] - avg2(pix2
[7], pix2
[8]));
1048 s
+= abs(pix1
[8] - avg2(pix2
[8], pix2
[9]));
1049 s
+= abs(pix1
[9] - avg2(pix2
[9], pix2
[10]));
1050 s
+= abs(pix1
[10] - avg2(pix2
[10], pix2
[11]));
1051 s
+= abs(pix1
[11] - avg2(pix2
[11], pix2
[12]));
1052 s
+= abs(pix1
[12] - avg2(pix2
[12], pix2
[13]));
1053 s
+= abs(pix1
[13] - avg2(pix2
[13], pix2
[14]));
1054 s
+= abs(pix1
[14] - avg2(pix2
[14], pix2
[15]));
1055 s
+= abs(pix1
[15] - avg2(pix2
[15], pix2
[16]));
1062 int pix_abs16x16_y2_c(UINT8
*pix1
, UINT8
*pix2
, int line_size
)
1065 UINT8
*pix3
= pix2
+ line_size
;
1069 s
+= abs(pix1
[0] - avg2(pix2
[0], pix3
[0]));
1070 s
+= abs(pix1
[1] - avg2(pix2
[1], pix3
[1]));
1071 s
+= abs(pix1
[2] - avg2(pix2
[2], pix3
[2]));
1072 s
+= abs(pix1
[3] - avg2(pix2
[3], pix3
[3]));
1073 s
+= abs(pix1
[4] - avg2(pix2
[4], pix3
[4]));
1074 s
+= abs(pix1
[5] - avg2(pix2
[5], pix3
[5]));
1075 s
+= abs(pix1
[6] - avg2(pix2
[6], pix3
[6]));
1076 s
+= abs(pix1
[7] - avg2(pix2
[7], pix3
[7]));
1077 s
+= abs(pix1
[8] - avg2(pix2
[8], pix3
[8]));
1078 s
+= abs(pix1
[9] - avg2(pix2
[9], pix3
[9]));
1079 s
+= abs(pix1
[10] - avg2(pix2
[10], pix3
[10]));
1080 s
+= abs(pix1
[11] - avg2(pix2
[11], pix3
[11]));
1081 s
+= abs(pix1
[12] - avg2(pix2
[12], pix3
[12]));
1082 s
+= abs(pix1
[13] - avg2(pix2
[13], pix3
[13]));
1083 s
+= abs(pix1
[14] - avg2(pix2
[14], pix3
[14]));
1084 s
+= abs(pix1
[15] - avg2(pix2
[15], pix3
[15]));
1092 int pix_abs16x16_xy2_c(UINT8
*pix1
, UINT8
*pix2
, int line_size
)
1095 UINT8
*pix3
= pix2
+ line_size
;
1099 s
+= abs(pix1
[0] - avg4(pix2
[0], pix2
[1], pix3
[0], pix3
[1]));
1100 s
+= abs(pix1
[1] - avg4(pix2
[1], pix2
[2], pix3
[1], pix3
[2]));
1101 s
+= abs(pix1
[2] - avg4(pix2
[2], pix2
[3], pix3
[2], pix3
[3]));
1102 s
+= abs(pix1
[3] - avg4(pix2
[3], pix2
[4], pix3
[3], pix3
[4]));
1103 s
+= abs(pix1
[4] - avg4(pix2
[4], pix2
[5], pix3
[4], pix3
[5]));
1104 s
+= abs(pix1
[5] - avg4(pix2
[5], pix2
[6], pix3
[5], pix3
[6]));
1105 s
+= abs(pix1
[6] - avg4(pix2
[6], pix2
[7], pix3
[6], pix3
[7]));
1106 s
+= abs(pix1
[7] - avg4(pix2
[7], pix2
[8], pix3
[7], pix3
[8]));
1107 s
+= abs(pix1
[8] - avg4(pix2
[8], pix2
[9], pix3
[8], pix3
[9]));
1108 s
+= abs(pix1
[9] - avg4(pix2
[9], pix2
[10], pix3
[9], pix3
[10]));
1109 s
+= abs(pix1
[10] - avg4(pix2
[10], pix2
[11], pix3
[10], pix3
[11]));
1110 s
+= abs(pix1
[11] - avg4(pix2
[11], pix2
[12], pix3
[11], pix3
[12]));
1111 s
+= abs(pix1
[12] - avg4(pix2
[12], pix2
[13], pix3
[12], pix3
[13]));
1112 s
+= abs(pix1
[13] - avg4(pix2
[13], pix2
[14], pix3
[13], pix3
[14]));
1113 s
+= abs(pix1
[14] - avg4(pix2
[14], pix2
[15], pix3
[14], pix3
[15]));
1114 s
+= abs(pix1
[15] - avg4(pix2
[15], pix2
[16], pix3
[15], pix3
[16]));
1122 int pix_abs8x8_c(UINT8
*pix1
, UINT8
*pix2
, int line_size
)
1128 s
+= abs(pix1
[0] - pix2
[0]);
1129 s
+= abs(pix1
[1] - pix2
[1]);
1130 s
+= abs(pix1
[2] - pix2
[2]);
1131 s
+= abs(pix1
[3] - pix2
[3]);
1132 s
+= abs(pix1
[4] - pix2
[4]);
1133 s
+= abs(pix1
[5] - pix2
[5]);
1134 s
+= abs(pix1
[6] - pix2
[6]);
1135 s
+= abs(pix1
[7] - pix2
[7]);
1142 int pix_abs8x8_x2_c(UINT8
*pix1
, UINT8
*pix2
, int line_size
)
1148 s
+= abs(pix1
[0] - avg2(pix2
[0], pix2
[1]));
1149 s
+= abs(pix1
[1] - avg2(pix2
[1], pix2
[2]));
1150 s
+= abs(pix1
[2] - avg2(pix2
[2], pix2
[3]));
1151 s
+= abs(pix1
[3] - avg2(pix2
[3], pix2
[4]));
1152 s
+= abs(pix1
[4] - avg2(pix2
[4], pix2
[5]));
1153 s
+= abs(pix1
[5] - avg2(pix2
[5], pix2
[6]));
1154 s
+= abs(pix1
[6] - avg2(pix2
[6], pix2
[7]));
1155 s
+= abs(pix1
[7] - avg2(pix2
[7], pix2
[8]));
1162 int pix_abs8x8_y2_c(UINT8
*pix1
, UINT8
*pix2
, int line_size
)
1165 UINT8
*pix3
= pix2
+ line_size
;
1169 s
+= abs(pix1
[0] - avg2(pix2
[0], pix3
[0]));
1170 s
+= abs(pix1
[1] - avg2(pix2
[1], pix3
[1]));
1171 s
+= abs(pix1
[2] - avg2(pix2
[2], pix3
[2]));
1172 s
+= abs(pix1
[3] - avg2(pix2
[3], pix3
[3]));
1173 s
+= abs(pix1
[4] - avg2(pix2
[4], pix3
[4]));
1174 s
+= abs(pix1
[5] - avg2(pix2
[5], pix3
[5]));
1175 s
+= abs(pix1
[6] - avg2(pix2
[6], pix3
[6]));
1176 s
+= abs(pix1
[7] - avg2(pix2
[7], pix3
[7]));
1184 int pix_abs8x8_xy2_c(UINT8
*pix1
, UINT8
*pix2
, int line_size
)
1187 UINT8
*pix3
= pix2
+ line_size
;
1191 s
+= abs(pix1
[0] - avg4(pix2
[0], pix2
[1], pix3
[0], pix3
[1]));
1192 s
+= abs(pix1
[1] - avg4(pix2
[1], pix2
[2], pix3
[1], pix3
[2]));
1193 s
+= abs(pix1
[2] - avg4(pix2
[2], pix2
[3], pix3
[2], pix3
[3]));
1194 s
+= abs(pix1
[3] - avg4(pix2
[3], pix2
[4], pix3
[3], pix3
[4]));
1195 s
+= abs(pix1
[4] - avg4(pix2
[4], pix2
[5], pix3
[4], pix3
[5]));
1196 s
+= abs(pix1
[5] - avg4(pix2
[5], pix2
[6], pix3
[5], pix3
[6]));
1197 s
+= abs(pix1
[6] - avg4(pix2
[6], pix2
[7], pix3
[6], pix3
[7]));
1198 s
+= abs(pix1
[7] - avg4(pix2
[7], pix2
[8], pix3
[7], pix3
[8]));
1206 /* permute block according so that it corresponds to the MMX idct
1209 /* general permutation, but perhaps slightly slower */
1210 void block_permute(INT16
*block
)
1215 for(i
=0; i
<64; i
++) temp
[ block_permute_op(i
) ] = block
[i
];
1217 for(i
=0; i
<64; i
++) block
[i
] = temp
[i
];
/* Hand-unrolled build variant of block_permute (alternate to the generic
 * version above).  NOTE(review): only the signature and the temporary
 * declarations are visible in this extraction; the loop body (original
 * lines 1224-1243) lies outside this view and is left untouched.
 * Presumably it applies the same per-row column permutation as
 * block_permute_op -- TODO confirm against the full file. */
void block_permute(INT16 *block)
int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1244 void clear_blocks_c(DCTELEM
*blocks
)
1246 memset(blocks
, 0, sizeof(DCTELEM
)*6*64);
/* XXX: those functions should be suppressed ASAP when all IDCTs are
 * converted to the combined idct_put/idct_add interface.
 * NOTE(review): the continuation of this comment (orig. line 1250), the
 * braces, and the idct invocation on the block (orig. line ~1253) fall
 * outside this extraction; only the visible statement is reproduced. */
void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
/* write the (presumably idct-transformed) coefficients into dest,
 * clamped to the valid pixel range */
put_pixels_clamped(block, dest, line_size);
/* Companion of gen_idct_put above.
 * NOTE(review): the braces and the idct invocation on the block fall
 * outside this extraction; only the visible statement is reproduced. */
void gen_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
/* accumulate the (presumably idct-transformed) coefficients onto the
 * existing pixels in dest, with clamping */
add_pixels_clamped(block, dest, line_size);
/*
 * dsputil_init: one-time initialization of the DSP layer.
 * Builds the clipping (cropTbl) and square (squareTbl) lookup tables,
 * installs the C reference implementations into the function pointers
 * declared at the top of this file, lets platform-specific initializers
 * override them, selects the idct_put/idct_add pair, and sets up the
 * scan-order permutation tables.
 *
 * NOTE(review): this chunk is a partial extraction -- local declarations
 * (i, j, permutation[]), the #ifdef lines guarding the platform
 * initializers, and several braces fall outside the visible lines; the
 * gaps are marked with comments below.
 */
void dsputil_init(void)
int use_permuted_idct;
/* identity section of the clip table: cropTbl[MAX_NEG_CROP + i] == i */
for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
/* guard bands: clamp below-range to 0 (line outside this view) and
 * above-range to 255 */
for(i=0;i<MAX_NEG_CROP;i++) {
cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* squareTbl[256 + d] == d*d for d in [-256, 255] (used for PSNR/SSE) */
for(i=0;i<512;i++) {
squareTbl[i] = (i - 256) * (i - 256);
/* default C (reference) implementations */
ff_idct = j_rev_dct;
get_pixels = get_pixels_c;
diff_pixels = diff_pixels_c;
put_pixels_clamped = put_pixels_clamped_c;
add_pixels_clamped = add_pixels_clamped_c;
clear_blocks = clear_blocks_c;
pix_abs16x16 = pix_abs16x16_c;
pix_abs16x16_x2 = pix_abs16x16_x2_c;
pix_abs16x16_y2 = pix_abs16x16_y2_c;
pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
pix_abs8x8 = pix_abs8x8_c;
pix_abs8x8_x2 = pix_abs8x8_x2_c;
pix_abs8x8_y2 = pix_abs8x8_y2_c;
pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
av_fdct = fdct_ifast;
use_permuted_idct = 1;
/* platform-specific overrides; the #ifdef guards (ARM, mlib, Alpha,
 * and presumably MMX) are outside this view */
dsputil_init_armv4l();
dsputil_init_mlib();
use_permuted_idct = 0;
dsputil_init_alpha();
use_permuted_idct = 0;
/* if no platform override installed an idct, fall back to simple_idct
 * (which has its own put/add entry points and needs no permutation);
 * otherwise wrap the chosen ff_idct with the generic helpers */
if (ff_idct == NULL) {
ff_idct_put = simple_idct_put;
ff_idct_add = simple_idct_add;
use_permuted_idct = 0;
/* (else-branch brace outside this view) */
ff_idct_put = gen_idct_put;
ff_idct_add = gen_idct_add;
/* build the coefficient permutation table; the #if/#else lines
 * selecting among these three alternatives are outside this view */
if(use_permuted_idct)
for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
/* per-row column shuffle: (x & 0x38) keeps the row, the low bits are
 * rotated so column c maps to ((c & 6) >> 1) | ((c & 1) << 2) */
for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/* identity permutation when no permuted idct is in use */
for(i=0; i<64; i++) permutation[i]=i;
/* inverse zigzag: coefficient position -> 1-based scan index */
for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
/* keep an unpermuted copy of the zigzag scan before permuting it */
for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
if (use_permuted_idct) {
/* permute for IDCT */
/* (enclosing for(i=0;i<64;i++) loop header outside this view) */
j = zigzag_direct[i];
zigzag_direct[i] = block_permute_op(j);
j = ff_alternate_horizontal_scan[i];
ff_alternate_horizontal_scan[i] = block_permute_op(j);
j = ff_alternate_vertical_scan[i];
ff_alternate_vertical_scan[i] = block_permute_op(j);
/* permute the default quantization matrices to match the idct order */
block_permute(default_intra_matrix);
block_permute(default_non_intra_matrix);
block_permute(ff_mpeg4_default_intra_matrix);
block_permute(ff_mpeg4_default_non_intra_matrix);
/* remove any non bit exact operation (testing purpose) */
/* NOTE(review): the #ifdef HAVE_MMX guard around the call and the
 * function braces are outside this extraction. */
void avcodec_set_bit_exact(void)
dsputil_set_bit_exact_mmx();
1367 void get_psnr(UINT8
*orig_image
[3], UINT8
*coded_image
[3],
1368 int orig_linesize
[3], int coded_linesize
,
1369 AVCodecContext
*avctx
)
1371 int quad
, diff
, x
, y
;
1372 UINT8
*orig
, *coded
;
1373 UINT32
*sq
= squareTbl
+ 256;
1379 orig
= orig_image
[0];
1380 coded
= coded_image
[0];
1382 for (y
=0;y
<avctx
->height
;y
++) {
1383 for (x
=0;x
<avctx
->width
;x
++) {
1384 diff
= *(orig
+ x
) - *(coded
+ x
);
1387 orig
+= orig_linesize
[0];
1388 coded
+= coded_linesize
;
1391 avctx
->psnr_y
= (float) quad
/ (float) (avctx
->width
* avctx
->height
);
1393 if (avctx
->psnr_y
) {
1394 avctx
->psnr_y
= (float) (255 * 255) / avctx
->psnr_y
;
1395 avctx
->psnr_y
= 10 * (float) log10 (avctx
->psnr_y
);
1397 avctx
->psnr_y
= 99.99;